# 📧 Email Spam Detection

This notebook uses machine learning to classify emails as spam or not spam based on their content. We convert the email text to TF-IDF features, train a Random Forest classifier, and walk through data cleaning, feature selection, preprocessing, model training, and evaluation.

In [None]:
# 📦 Imports
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# Suppress every Python warning for the rest of the kernel session (presumably to keep
# cell output tidy). NOTE(review): this also hides deprecation and convergence warnings
# raised by pandas / scikit-learn — consider narrowing the filter.
warnings.filterwarnings("ignore")

## 📂 Load Dataset

In [None]:
# Read the email dataset from a CSV path resolved against the current working
# directory, then preview the first five rows as the cell's output.
df = pd.read_csv(filepath_or_buffer="email_spam_data.csv")
df.head(n=5)

## 🧼 Data Cleaning

In [None]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how="any", inplace=True)

# Per-column count of remaining nulls; every entry should now be 0.
df.isna().sum()

## 🎯 Feature Selection and Target Encoding

In [None]:
# Turn the numeric IsSpam flag into readable class labels (1 -> 'Spam', 0 -> 'Not Spam').
#
# Series.map() returns NaN for every value that is not a key of the mapping, so mapping
# an already-labelled column would replace all labels with NaN. The conversion is
# therefore skipped when the column already holds only the final labels, which makes
# this cell safe to re-run.
label_names = {1: 'Spam', 0: 'Not Spam'}
spam_labels = df['IsSpam']
if not spam_labels.isin(list(label_names.values())).all():
    spam_labels = spam_labels.map(label_names)
    # Validate before touching df so a bad value never overwrites the original column.
    if spam_labels.isna().any():
        raise ValueError(
            "IsSpam contains values other than 0/1; cannot convert them to class labels."
        )
    df['IsSpam'] = spam_labels

X = df['EmailContent']  # Feature: raw email text (a single column, i.e. a Series)
y = df['IsSpam']  # Target: 'Spam' or 'Not Spam'

## ⚙️ Preprocessing Setup

In [None]:
# TF-IDF vectorizer: drops English stop words and keeps at most 500 terms.
# It is only configured here; fitting happens later inside the pipeline.
tfidf = TfidfVectorizer(max_features=500, stop_words='english')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## 🤖 Model Training

In [None]:
# Chain the TF-IDF vectorizer and a 100-tree random forest into one estimator, so that
# fitting the pipeline fits the vectorizer on the training split only.
pipeline = Pipeline(steps=[
    ('vectorizer', tfidf),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
])
pipeline.fit(X=X_train, y=y_train)

## 📈 Model Evaluation

In [None]:
# Predict on the held-out test split with the fitted pipeline.
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
# Print the header and the report separately: print()'s default separator would otherwise
# put an extra space in front of the report's first line and misalign its header row.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Pin the class order explicitly and reuse it for the tick labels, so every row and
# column of the heatmap is labelled with its class name instead of a bare index.
class_labels = pipeline.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

## 💾 Save Model

In [None]:
# Persist the fitted pipeline (vectorizer + classifier) for later reuse.
joblib.dump(pipeline, "spam_classifier_model.pkl")

# Persist the name(s) of the input column(s) the pipeline was trained on.
# X is a single column selected from df (a Series), which has a `.name` but no
# `.columns` attribute; the hasattr branch keeps this working if X becomes a DataFrame.
feature_columns = X.columns.tolist() if hasattr(X, "columns") else [X.name]
joblib.dump(feature_columns, "spam_features.pkl")