# Email Span Detection

In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('spam.csv', encoding='latin-1')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [11]:
print(df.shape)
print(df.isna().sum())

(5572, 5)
v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64


In [34]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
bag_of_words = count.fit_transform(df['v2'])

In [50]:
print(bag_of_words.shape)
print(bag_of_words[0].shape)

(5572, 8672)
(1, 8672)


In [51]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

X = bag_of_words.toarray()

y = df['v1'].astype(pd.CategoricalDtype())

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

model = GaussianNB()

model.fit(X_train, y_train)

In [52]:
from sklearn.metrics import classification_report
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

         ham       0.99      0.91      0.95       974
        spam       0.59      0.91      0.72       141

    accuracy                           0.91      1115
   macro avg       0.79      0.91      0.83      1115
weighted avg       0.94      0.91      0.92      1115



In [67]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score, KFold

# model = GaussianNB()
# model = MultinomialNB()
model = BernoulliNB()

X = bag_of_words.toarray()
y = df['v1'].astype(pd.CategoricalDtype())

kf = KFold(n_splits=10, shuffle=True)

cv_scores = cross_val_score(
    estimator = model,
    X=X,
    y=y,
    cv=kf,
    n_jobs=-1    
)

cv_scores.mean()

0.981334337174958