# Email Spam Detection (Spam vs Ham)

This notebook builds a spam classifier using TF‑IDF + Multinomial Naive Bayes.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib

# Read the raw spam CSV; latin-1 decoding is used to avoid Unicode errors on load.
df_raw = pd.read_csv(
    filepath_or_buffer='data/spam.csv',
    encoding='latin-1',
)
# Preview the first rows as this cell's output.
df_raw.head()

In [None]:
# Keep only the label and text columns, renamed for readability, and drop rows
# with missing values. The frame is rebuilt from df_raw (no in-place mutation),
# so re-running this cell always produces the same result.
df_text = (
    df_raw[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .dropna()
)

# Map labels to integers. Series.map yields NaN for any value that is missing
# from the dictionary, so check explicitly instead of letting NaN labels slip
# through to the train/test split and the classifier.
LABEL_TO_NUM = {'ham': 0, 'spam': 1}
label_num = df_text['label'].map(LABEL_TO_NUM)
if label_num.isna().any():
    unknown_labels = sorted(df_text.loc[label_num.isna(), 'label'].astype(str).unique())
    raise ValueError(f'Unexpected label values (expected ham/spam): {unknown_labels}')

# Integer dtype is guaranteed here because every label was mapped.
df = df_text.assign(label_num=label_num.astype('int64'))

# Fixed seed so the preview shows the same rows on every run.
df.sample(5, random_state=42)

In [None]:
# Class balance: number of messages per label
ax = sns.countplot(data=df, x='label')
ax.set_title('Label Distribution')
plt.show()

# Character count of each message, used only for quick EDA
df['msg_len'] = df['message'].str.len()
len_ax = sns.histplot(data=df, x='msg_len', hue='label', bins=50, kde=True)
len_ax.set_title('Message Length Distribution by Label')
plt.show()

# Summary statistics of message length for each label
df[['msg_len', 'label']].groupby('label').describe()

In [None]:
# Hold out 20% of the messages for evaluation; the split is stratified on the
# numeric label and seeded so it is repeatable.
messages, targets = df['message'], df['label_num']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

# Text -> TF-IDF features -> Multinomial Naive Bayes
tfidf_step = TfidfVectorizer(stop_words='english', lowercase=True, max_df=0.95, min_df=2)
nb_step = MultinomialNB(alpha=0.1)
pipe = Pipeline(steps=[('tfidf', tfidf_step), ('nb', nb_step)])
pipe.fit(X_train, y_train)

# Predict labels for the held-out messages
y_pred = pipe.predict(X_test)

# Score the predictions with each metric (called with library defaults)
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f'Accuracy:  {acc:.4f}')
print(f'Precision: {prec:.4f}')
print(f'Recall:    {rec:.4f}')
print(f'F1-score:  {f1:.4f}')

# Per-class breakdown; target_names follow the 0 -> ham, 1 -> spam mapping
report = classification_report(y_test, y_pred, target_names=['ham','spam'])
print('\nClassification Report:\n', report)

In [None]:
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap
# (rows are labelled as actual classes, columns as predicted classes).
class_names = ['ham', 'spam']
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Persist the fitted pipeline to disk with joblib
joblib.dump(value=pipe, filename='spam_nb_pipeline.joblib')
print('Saved trained pipeline to spam_nb_pipeline.joblib')

In [None]:
# Helper to test custom messages
def predict_message(msg: str, model=None) -> str:
    """Classify one message and report the confidence of the prediction.

    Args:
        msg: Raw message text to classify.
        model: Optional fitted classifier exposing ``predict_proba`` and
            ``classes_`` (for example a pipeline reloaded with joblib).
            When omitted, the notebook's trained ``pipe`` is used.

    Returns:
        ``'Spam (confidence: p)'`` when the most probable class is label 1,
        otherwise ``'Not Spam (confidence: p)'``; ``p`` is that class's
        probability formatted with two decimals.
    """
    clf = pipe if model is None else model
    # Single inference pass: take the most probable class from predict_proba
    # instead of calling predict and predict_proba separately.
    probs = clf.predict_proba([msg])[0]
    best = int(np.argmax(probs))
    # Resolve the label through classes_ so the probability column index is
    # never confused with the label value itself.
    is_spam = clf.classes_[best] == 1
    return f"{'Spam' if is_spam else 'Not Spam'} (confidence: {probs[best]:.2f})"

# Try the helper on one spam-like message and one ordinary message
for sample_msg in (
    'You have WON $5000!! Click here to claim now.',
    'Hi, can we meet tomorrow at 5?',
):
    print(predict_message(sample_msg))

## Conclusion
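Using TF‑IDF features with a Multinomial Naive Bayes classifier yields strong performance for SMS/email spam detection; see the accuracy, precision, recall, and F1 scores reported above for the held-out test split. The trained pipeline is saved as `spam_nb_pipeline.joblib` and can be reloaded with `joblib.load('spam_nb_pipeline.joblib')` to classify new messages without retraining.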