In [48]:
import pandas as pd
import numpy as np
import seaborn as sns

In [49]:
df = pd.read_csv('emails.csv')
print(df.head())
print('Rows and columns: ',df.shape)
print("Spams and no spams: ",df['spam'].value_counts())

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1
Rows and columns:  (5728, 2)
Spams and no spams:  spam
0    4360
1    1368
Name: count, dtype: int64


In [50]:
df.isnull().sum()

Unnamed: 0,0
text,0
spam,0


In [51]:
# Separate x and y
x = df.text.values
y = df.spam.values

In [52]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [53]:
# droping duplicate mails:

df.drop_duplicates(inplace=True)
print(df.shape)

(5695, 2)


In [54]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

# Data Preprocessing
by **CountVectorizer**

**CountVectorizer** transforms text into a numerical feature matrix by counting how often each word appears in the documents.

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

In [56]:
x_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# ML Algorithm

By Multinomial Naive Bayes

Naive Bayes is based on **Bayes’ Theorem**, a math rule that helps us figure out the chance of something based on what we already know. Here’s the basic idea:
- You want to know: "Given these features, how likely is this class?"
- Bayes’ Theorem uses probabilities from past data to answer this.

In math terms, Bayes’ Theorem is:

$ P(\text{class}|\text{features}) = \frac{P(\text{features}|\text{class}) \cdot P(\text{class})}{P(\text{features})} $  

and **Multinomial Naive Bayes** is a probabilistic classifier that applies Bayes' theorem on count-based features, assuming all features contribute independently.

In [57]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train,y_train)

In [58]:
x_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [59]:
model.score(x_test,y_test)

0.9895287958115183

# Checking the accuracy by testing emails

In [60]:
emails = ['Hey, I am looking for ML tutorials in bangla','Hey, win an iphone x giveaway for free by doing the survey']

In [61]:
cv_emails = cv.transform(emails)

In [62]:
model.predict(cv_emails)

array([0, 1])

1st mail is ham, 2nd mail is spam