# Libraries

In [42]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
import ipywidgets as widgets
from IPython.display import display
import pickle

[nltk_data] Downloading package stopwords to C:\Users\FIREFLY
[nltk_data]     LAPTOP'S\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load & view Data

In [43]:
# Location of the labelled e-mail dataset, relative to the notebook's working directory.
file_path = 'Email_Data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Announce the preview, then let the cell's last expression render the top of the frame.
print("Let's we see first Ten rows of our data")
df.head(n=10)


Let's we see first Ten rows of our data


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [44]:
# Also preview the end of the frame; the cell's last expression renders it.
print("Let's we see last Ten rows of our data")
df.tail(n=10)

Let's we see last Ten rows of our data


Unnamed: 0,text,spam
5718,"Subject: altos na gas model kim , i know you ...",0
5719,Subject: power market research i came across ...,0
5720,Subject: re : visit to houston fyi - - - - -...,0
5721,Subject: ees risk management presentations for...,0
5722,Subject: re : vacation vince : i just found ...,0
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0
5727,Subject: news : aurora 5 . 2 update aurora ve...,0


In [45]:
# Print the frame summary (index range, columns, non-null counts, dtypes, memory use).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [46]:
# Count how many rows carry each value of the 'spam' label to see the class balance.
class_counts = df['spam'].value_counts()
print(class_counts)

spam
0    4360
1    1368
Name: count, dtype: int64


### Null values

In [47]:
# Number of missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

text    0
spam    0
dtype: int64

# Clean the text data 

In [48]:
def clean_text(text):
    """Normalise a raw e-mail string into lowercase, space-separated word tokens.

    Steps, in order:
      1. lowercase the text,
      2. drop the literal placeholder token ``escapenumber``,
      3. replace every character that is not a letter, digit, or underscore
         with a space,
      4. collapse whitespace runs to a single space and trim both ends.

    Lowercasing happens first so the placeholder is removed regardless of
    how it is cased in the input.

    Parameters
    ----------
    text : str
        Raw message text. ``None`` and float NaN are treated as an empty
        message; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.
    """
    # Missing values (None / float NaN) become an empty message instead of
    # raising a TypeError inside re.sub. `text != text` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'escapenumber', '', text)
    text = re.sub(r'\W', ' ', text)
    # Collapse the runs of spaces produced above and trim the edges.
    return re.sub(r'\s+', ' ', text).strip()

# Run every message through clean_text, overwriting the 'text' column, then preview.
df['text'] = df['text'].map(clean_text)
df.head(n=10)

Unnamed: 0,text,spam
0,subject naturally irresistible your corporate ...,1
1,subject the stock trading gunslinger fanny is ...,1
2,subject unbelievable new homes made easy im wa...,1
3,subject 4 color printing special request addit...,1
4,subject do not have money get software cds fro...,1
5,subject great nnews hello welcome to medzonlin...,1
6,subject here s a hot play in motion homeland s...,1
7,subject save your money buy getting this thing...,1
8,subject undeliverable home based business for ...,1
9,subject save your money buy getting this thing...,1


## Remove stopwords

In [49]:
# NLTK's English stopword list, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` without the tokens that appear in ``stop_words``.

    The text is split on whitespace; surviving tokens keep their original
    order and are re-joined with single spaces. Matching is exact (no case
    folding is done here).
    """
    kept = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept)

# Strip stopwords from every message, overwriting the 'text' column, then preview.
df['text'] = df['text'].map(remove_stopwords)
df.head(n=10)

Unnamed: 0,text,spam
0,subject naturally irresistible corporate ident...,1
1,subject stock trading gunslinger fanny merrill...,1
2,subject unbelievable new homes made easy im wa...,1
3,subject 4 color printing special request addit...,1
4,subject money get software cds software compat...,1
5,subject great nnews hello welcome medzonline s...,1
6,subject hot play motion homeland security inve...,1
7,subject save money buy getting thing tried cia...,1
8,subject undeliverable home based business grow...,1
9,subject save money buy getting thing tried cia...,1


In [36]:
# A single Porter stemmer instance, reused for every word.
stemmer = PorterStemmer()


def stem_text(text):
    """Stem each whitespace-separated word of ``text`` and re-join with single spaces."""
    return ' '.join(map(stemmer.stem, text.split()))

# Stem every message, overwriting the 'text' column, then preview.
# NOTE(review): this cell's execution count (36) is out of sequence with the
# cells around it (49, 50), and re-running it stems already-stemmed text.
# Restart the kernel and run all cells top to bottom before trusting the metrics.
df['text'] = df['text'].map(stem_text)
df.head(n=10)

Unnamed: 0,text,spam
0,subject natur irresist corpor ident lt realli ...,1
1,subject stock trade gunsling fanni merril muzo...,1
2,subject unbeliev new home made easi im want sh...,1
3,subject 4 color print special request addit in...,1
4,subject money get softwar cd softwar compat gr...,1
5,subject great nnew hello welcom medzonlin sh g...,1
6,subject hot play motion homeland secur invest ...,1
7,subject save money buy get thing tri ciall yet...,1
8,subject undeliver home base busi grownup messa...,1
9,subject save money buy get thing tri ciall yet...,1


### TF-IDF Features and Labels (Spam = 1, Not Spam = 0)

In [50]:

# Turn each message into a TF-IDF vector over a vocabulary capped at 5000 terms.
# The result is kept as a SciPy sparse matrix: train_test_split and
# MultinomialNB both accept sparse input, so converting to a dense array would
# only allocate a large, mostly-zero (n_messages x 5000) block of memory.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so its vocabulary and IDF weights are influenced by test
# messages. Consider splitting first and fitting on the training part only.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['text'])

# Target labels taken straight from the 'spam' column.
y = df['spam']


### Split the dataset 

In [51]:

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split


In [52]:
# Fit a multinomial Naive Bayes classifier on the training split.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)


### Model Evaluation

In [53]:
# Predict on the held-out split and score the predictions against the true labels.
y_pred = model.predict(X_test)

scorers = (accuracy_score, precision_score, recall_score, f1_score)
accuracy, precision, recall, f1 = (score(y_test, y_pred) for score in scorers)

# Print one "<name>: <value>" line per metric, in a fixed order.
report = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}
for label, value in report.items():
    print(f'{label}: {value}')


Accuracy: 0.9781849912739965
Precision: 0.9818181818181818
Recall: 0.9310344827586207
F1 Score: 0.9557522123893805


In [54]:
# Persist the fitted classifier.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# these artefacts from a trusted source.
with open("spam_email_model.pkl", "wb") as f:
    pickle.dump(model, f)
print("Model has been saved to 'spam_email_model.pkl'")

# The classifier can only score TF-IDF vectors produced by this exact fitted
# vectorizer (same vocabulary and IDF weights), so save it alongside the model.
# Without it the saved model cannot be applied to new text.
with open("spam_email_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)
print("Vectorizer has been saved to 'spam_email_vectorizer.pkl'")

Model has been saved to 'spam_email_model.pkl'
