# Email Spam Detection with Machine Learning

# Import Required Libraries

In [18]:
# Data manipulation libraries
import pandas as pd

# Machine Learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer


# Text preprocessing libraries
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk


# For saving/loading the model
import pickle


# Load the Dataset

In [19]:
# Location of the CSV file holding the messages and their spam labels.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after this value is adjusted.
DATASET_PATH = r"C:\Users\ELCOT\Downloads\data_prac\spam_dataset.csv"

# Read the whole file into a DataFrame.
df = pd.read_csv(DATASET_PATH)

In [20]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['message_content', 'is_spam'], dtype='object')

In [21]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,message_content,is_spam
0,"Hello Lonnie,\n\nJust wanted to touch base reg...",0
1,"Congratulations, you've won a prize! Call us n...",1
2,You have been pre-approved for a credit card w...,1


In [22]:
# Summarise row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   message_content  1000 non-null   object
 1   is_spam          1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [23]:
# Count how many rows carry each value of the is_spam label.
df['is_spam'].value_counts()


is_spam
0    500
1    500
Name: count, dtype: int64

The dataset is balanced, with 500 non-spam emails (is_spam = 0) and 500 spam emails (is_spam = 1). This is ideal because a balanced dataset helps the model learn both classes without being biased toward one of them.

# Download NLTK Data and Preprocess the Text

In [24]:
# Fetch the NLTK resources used by the preprocessing step below.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Module-level helpers shared by preprocess_text: the English stop-word list
# (as a set, for fast membership tests) and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text) -> str:
    """Normalise one raw message into a space-separated string of lemmas.

    Steps: lower-case the text, replace every run of non-word characters
    with a single space, split on whitespace, drop English stop words and
    lemmatize the remaining tokens.

    Args:
        text: The raw message. ``None`` and float ``NaN`` (the value pandas
            uses for an empty cell in a text column) are treated as an empty
            message; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when the input is
        missing or no token survives the filtering.
    """
    # Guard against missing values so `.apply(preprocess_text)` does not fail
    # with AttributeError on an empty cell. NaN is the only float that is not
    # equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    normalized = re.sub(r'\W+', ' ', str(text).lower())
    # Stop words are filtered on the raw token, before lemmatization.
    kept_tokens = (token for token in normalized.split() if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)


# Clean every message and keep the result next to the original text.
cleaned_series = df['message_content'].apply(preprocess_text)
df['cleaned_message'] = cleaned_series

# Print the first rows of the raw and cleaned columns side by side.
preview_columns = ['message_content', 'cleaned_message']
print(df[preview_columns].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ELCOT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ELCOT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                     message_content  \
0  Hello Lonnie,\n\nJust wanted to touch base reg...   
1  Congratulations, you've won a prize! Call us n...   
2  You have been pre-approved for a credit card w...   
3  Limited time offer, act now! Only a few spots ...   
4  Your loan has been approved! Transfer funds to...   

                                     cleaned_message  
0  hello lonnie wanted touch base regarding proje...  
1  congratulation prize call u claim account sele...  
2  pre approved credit card high limit special of...  
3  limited time offer act spot left immediate act...  
4  loan approved transfer fund today hurry lifeti...  


# Vectorize the Text Data

In [25]:
# Turn the cleaned messages into a document-term count matrix.
# NOTE(review): the vocabulary is fitted on every row of df, i.e. before the
# train/test split that follows, so terms that occur only in test rows still
# get a feature column.
corpus = df['cleaned_message']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Target labels taken from the is_spam column.
y = df['is_spam']

print("Feature matrix shape:", X.shape)


Feature matrix shape: (1000, 1649)


# Split the Data into Training and Testing Sets

In [26]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# and seeded so that it is reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=y,
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (800, 1649)
Testing set shape: (200, 1649)


# Train the Machine Learning Model
 
Train a MultinomialNB (Naive Bayes) classifier on the training data, then evaluate it on the held-out test set. This model works well for text classification problems.

In [27]:
# Fit the classifier on the training split (fit returns the estimator itself).
model = MultinomialNB().fit(X_train, y_train)

# Score the model on the rows it has not seen during fitting.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Model Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Model Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00       100

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



# Save the Model and Vectorizer

In [28]:
# Persist the classifier together with its fitted vectorizer as one tuple, so
# both can be restored from a single file.
artifacts = (model, vectorizer)
with open("spam_detector.pkl", "wb") as model_file:
    pickle.dump(artifacts, model_file)

print("Model saved successfully!")


Model saved successfully!


In [29]:
# Restore the (model, vectorizer) tuple from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
with open("spam_detector.pkl", "rb") as model_file:
    restored = pickle.load(model_file)
loaded_model, loaded_vectorizer = restored

print("Model and vectorizer loaded successfully!")


Model and vectorizer loaded successfully!


# Preprocess and Predict on New Email Samples

In [30]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# preprocess_text, lemmatizer and stop_words are already defined in the
# preprocessing cell earlier in this notebook. They are reused here instead of
# being redefined, so new e-mails are cleaned by exactly the same code as the
# training data and the two copies cannot drift apart.


# Hand-written example messages used to try out the restored model.
new_emails = [
    "Congratulations! You have won a free iPhone. Click here to claim.",
    "Hi, I wanted to follow up on the meeting we scheduled.",
    "Get a loan approved in 24 hours! Act now.",
]

# Apply the same cleaning that was applied to the training messages.
new_emails_cleaned = list(map(preprocess_text, new_emails))


In [31]:

# Encode the cleaned messages with the restored vectorizer; transform (not
# fit_transform) is used, so the already-fitted vocabulary is left unchanged.
new_emails_transformed = loaded_vectorizer.transform(new_emails_cleaned)


In [32]:

# Classify the encoded messages with the restored model.
predictions = loaded_model.predict(new_emails_transformed)

# Report each original message next to its predicted label (1 means spam).
separator = "-" * 50
for email, pred in zip(new_emails, predictions):
    label = 'Spam' if pred == 1 else 'Not Spam'
    print(f"Email: {email}")
    print(f"Prediction: {label}")
    print(separator)


Email: Congratulations! You have won a free iPhone. Click here to claim.
Prediction: Spam
--------------------------------------------------
Email: Hi, I wanted to follow up on the meeting we scheduled.
Prediction: Not Spam
--------------------------------------------------
Email: Get a loan approved in 24 hours! Act now.
Prediction: Spam
--------------------------------------------------


This is the full step-by-step process for building a spam detection system from scratch, including preprocessing, model training, evaluation, saving/loading, and predicting new samples.