<a href="https://colab.research.google.com/github/abdulhalik-ai/email-spam-detection/blob/main/email_spam_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# ----------------------------------------
# Email Spam Detection using Machine Learning
# Author: N. Abdul Halik
# Simple & beginner-friendly project
# ----------------------------------------

# Step 1: Import required libraries
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Step 2: Load the dataset
# Dataset file: spam.csv
# Columns:
# Category -> spam / ham
# Message  -> email text

data = pd.read_csv("spam.csv", encoding="latin-1")

# Select required columns
data = data[['Category', 'Message']]
data.columns = ['label', 'text']

print("Dataset loaded successfully")
print(data.head())


# Step 3: Convert labels to numbers
# spam = 1, ham = 0
data['label'] = data['label'].map({'spam': 1, 'ham': 0})


# Step 4: Split data into input and output
X = data['text']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Data split completed")


# Step 5: Convert text into numbers using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Text converted into numerical format")


# Step 6: Train the Machine Learning model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

print("Model training completed")


# Step 7: Check model accuracy
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

print("Model Accuracy:", accuracy)


# Step 8: Function to test custom email
def predict_email(message):
    message_tfidf = vectorizer.transform([message])
    prediction = model.predict(message_tfidf)

    if prediction[0] == 1:
        return "Spam Email ❌"
    else:
        return "Not Spam Email ✅"


# Example test
sample_email = "Congratulations! You have won a free prize. Click now!"
print("Sample Email Prediction:")
print(predict_email(sample_email))


Dataset loaded successfully
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Data split completed
Text converted into numerical format
Model training completed
Model Accuracy: 0.967713004484305
Sample Email Prediction:
Spam Email ❌
