# In [None]:
# Spam Detection Project – Lite Version

# ---------------------------
# 🔍 Step 1: Load and Preview Dataset
# ---------------------------
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Read the SMS corpus (adjust the path to your copy of the file), rename the
# columns 'v1'/'v2' to 'label'/'text', and keep only those two columns.
df = (
    pd.read_csv("sms_spam_detector.csv", encoding='ISO-8859-1')
    .rename(columns={'v1': 'label', 'v2': 'text'})
)[['label', 'text']]

# Bar chart of how many rows carry each label value.
sns.countplot(x='label', data=df)
plt.title("Ham vs Spam Distribution")
plt.show()

# ---------------------------
# 🌟 Step 2: Sample Preprocessing Summary
# ---------------------------
# [Preprocessing summary: lowercasing, punctuation removal, tokenization, stop-word removal — none of these steps is applied to `df` in this notebook]

# Build one word cloud per class from the 'text' column.
# NOTE(review): the text is used exactly as loaded; no cleaning is applied in
# the code shown here — confirm the CSV already holds preprocessed text.
spam_words = ' '.join(df.loc[df['label'] == 'spam', 'text'])
ham_words = ' '.join(df.loc[df['label'] == 'ham', 'text'])

plt.figure(figsize=(14, 6))

# Draw both clouds side by side: spam in the left panel, ham in the right.
for position, (corpus, heading) in enumerate(
    [(spam_words, "Spam Word Cloud"), (ham_words, "Ham Word Cloud")],
    start=1,
):
    plt.subplot(1, 2, position)
    cloud = WordCloud(width=600, height=400, background_color='white').generate(corpus)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(heading)

plt.tight_layout()
plt.show()

# ---------------------------
# 🔢 Step 3: Model Performance Summary
# ---------------------------
import pandas as pd

# Hard-coded results table: one row per model, listing the vectorizer used
# together with its accuracy and F1 score.
summary_df = pd.DataFrame(
    data=[
        ("LogisticRegression", "CountVectorizer", 0.9989, 0.9989),
        ("MultinomialNB", "CountVectorizer", 0.9950, 0.9951),
        ("ComplementNB", "CountVectorizer", 0.9950, 0.9951),
    ],
    columns=["Model", "Vectorizer", "Accuracy", "F1 Score"],
)

# Bare expression so a notebook cell renders the table.
summary_df

# ---------------------------
# ✅ Step 4: Prediction Example (Cleaned + Deployed)
# ---------------------------
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack, csr_matrix
import numpy as np
import string

# Restore the persisted classifier, text vectorizer and metadata scaler, in
# that order, from the working directory.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code —
# only load artifact files that come from a trusted source.
model, vectorizer, scaler = (
    joblib.load(artifact)
    for artifact in ("final_model.pkl", "final_vectorizer.pkl", "final_scaler.pkl")
)

# Define prediction function

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def predict_message(message, *, threshold=0.05):
    """Classify a single message as spam or ham.

    The message is turned into two feature groups that are stacked side by
    side and passed to the module-level ``model``:

    * text features – the lower-cased, punctuation-free, whitespace-normalised
      message run through the module-level ``vectorizer``;
    * four metadata features computed on the *raw* message (digit count,
      ``!`` count, link flag, punctuation ratio) run through the module-level
      ``scaler``.

    Args:
        message: The raw message text (a ``str``).
        threshold: Probability above which the message is reported as spam.
            The default of 0.05 is the cut-off that was previously hard-coded.
            NOTE(review): this is far below 0.5, presumably tuned to favour
            catching spam — confirm before changing.

    Returns:
        ``"🚨 Spam"`` if the predicted probability exceeds ``threshold``,
        otherwise ``"✅ Ham"``.
    """
    # Text branch: lowercase, strip punctuation, collapse runs of whitespace.
    clean_text = " ".join(message.lower().translate(_PUNCT_TABLE).split())

    # Metadata branch: all counts are taken from the untouched message.
    digit_count = sum(char.isdigit() for char in message)
    ex_count = message.count("!")
    # Case-sensitive substring check, kept as-is.
    # NOTE(review): presumably mirrors the feature used when the scaler/model
    # were fitted — verify before making it case-insensitive.
    has_link = int("http" in message or "www" in message)
    punct_count = sum(char in string.punctuation for char in message)
    # Guard against division by zero on an empty message.
    punct_percent = round(punct_count / len(message), 3) if message else 0

    X_text = vectorizer.transform([clean_text])
    # NOTE(review): feature order presumably matches the order the scaler was
    # fitted with — verify against the training code.
    X_meta = scaler.transform([[digit_count, ex_count, has_link, punct_percent]])
    X_final = hstack([X_text, csr_matrix(X_meta)])

    # Column 1 of predict_proba is taken as the spam probability.
    # NOTE(review): assumes the model's second class is spam — TODO confirm.
    prob = model.predict_proba(X_final)[0][1]
    return "🚨 Spam" if prob > threshold else "✅ Ham"

# Smoke test: classify one promotional-style message and print the verdict.
sample_message = "Win a FREE iPhone now! Click the link."
print(predict_message(sample_message))