sms_spam_detection.py


In [None]:
import pickle
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Fetch the NLTK stop-word corpus (a no-op when it is already on disk).
nltk.download('stopwords')
# Keep the stop words in a set: clean_text() performs one membership test per
# token, and a set lookup is O(1) whereas a list would be scanned linearly.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading Dataset

In [None]:
# The SMS TSV has no header row: every line is a "<label>\t<message>" record.
# header=None stops pandas from consuming the first record as column names
# (which silently dropped one message), and names= labels the two columns.
data = pd.read_csv(
    "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv",
    sep='\t',
    header=None,
    names=['label', 'message'],
)

Preprocessing Text

In [None]:
def clean_text(text):
    """Normalise a raw message into space-separated, stop-word-free tokens.

    The text is lower-cased, http(s)/www links are removed, every character in
    ``string.punctuation`` is deleted, and whitespace-separated words found in
    the module-level ``stop_words`` collection are dropped.

    Args:
        text: The message to clean (a ``str``).

    Returns:
        The surviving words joined by single spaces.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    punctuation_pattern = r'[%s]' % re.escape(string.punctuation)

    lowered = text.lower()
    without_urls = re.sub(url_pattern, '', lowered)
    without_punctuation = re.sub(punctuation_pattern, '', without_urls)

    kept = []
    for token in without_punctuation.split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Run every raw message through clean_text and store the result in a new
# 'clean_message' column; the original 'message' column is left untouched.
data['clean_message'] = data['message'].map(clean_text)

Converting Text to Numbers

In [None]:
# Target labels, taken straight from the 'label' column.
y = data['label']

# Bag-of-words features: learn the vocabulary from the cleaned messages, then
# encode those same messages as a token-count matrix.
# NOTE(review): the vocabulary is learned from all rows, i.e. before the
# train/test split, so words that only occur in test rows are still part of
# the feature space -- consider fitting on the training rows only.
cv = CountVectorizer()
cv.fit(data['clean_message'])
X = cv.transform(data['clean_message'])

Train/Test Splitting

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Training Model

In [None]:
# Fit a multinomial Naive Bayes classifier on the training token counts.
model = MultinomialNB()
model.fit(X_train, y_train)

# Persist the fitted classifier and the fitted vectorizer: the Streamlit app
# loads exactly these two files ("spam_model.pkl" and "vectorizer.pkl"), and
# no other cell in this notebook writes them.
with open("spam_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(cv, vectorizer_file)


Evaluating Model

In [None]:
# Predict on the held-out split, then compute and print the accuracy, the
# confusion matrix and the per-class precision/recall report.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("✅ Accuracy:", accuracy)
print("\nConfusion Matrix:\n", matrix)
print("\nClassification Report:\n", report)


✅ Accuracy: 0.9739910313901345

Confusion Matrix:
 [[937  18]
 [ 11 149]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.98      0.98       955
        spam       0.89      0.93      0.91       160

    accuracy                           0.97      1115
   macro avg       0.94      0.96      0.95      1115
weighted avg       0.97      0.97      0.97      1115



Testing with Custom Message

In [None]:
# Classify one hand-written message.  The vectorizer was fitted on
# clean_text() output, so the same cleaning is applied here before
# vectorizing; otherwise the model is fed input that was preprocessed
# differently from its training data (URLs and punctuation left in).
msg = ["Congratulations! You won a free iPhone!"]
msg_vec = cv.transform([clean_text(text) for text in msg])
print("\nMessage:", msg[0])
print("Prediction:", model.predict(msg_vec)[0])



Message: Congratulations! You won a free iPhone!
Prediction: spam


In [None]:
pip install pandas scikit-learn nltk streamlit




App.py

In [None]:
import streamlit as st
import pickle
import string
import re
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK stop-word corpus (a no-op when it is already on disk).
nltk.download('stopwords')
# Keep the stop words in a set: clean_text() performs one membership test per
# token, and a set lookup is O(1) whereas a list would be scanned linearly.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading model and vectorizer

In [None]:
# Load the fitted classifier and vectorizer, closing each file handle as soon
# as it has been read instead of leaving it open for the life of the process.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load .pkl files that you produced yourself or otherwise trust.
with open("spam_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
with open("vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

Cleaning function

In [None]:
def clean_text(text):
    """Normalise a raw message into space-separated, stop-word-free tokens.

    The text is lower-cased, http(s)/www links are removed, every character in
    ``string.punctuation`` is deleted, and whitespace-separated words found in
    the module-level ``stop_words`` collection are dropped.

    Args:
        text: The message to clean (a ``str``).

    Returns:
        The surviving words joined by single spaces.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    punctuation_pattern = r'[%s]' % re.escape(string.punctuation)

    lowered = text.lower()
    without_urls = re.sub(url_pattern, '', lowered)
    without_punctuation = re.sub(punctuation_pattern, '', without_urls)

    kept = []
    for token in without_punctuation.split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)


Streamlit UI

In [None]:
# Page header and a one-line description of what the app does.
st.title("📱 SMS Spam Detection App")
st.write("Detect whether a message is **Spam** or **Not Spam** using Machine Learning!")

user_input = st.text_area("Enter your message:")

if st.button("Check Message"):
    if not user_input.strip():
        # An empty or whitespace-only box has nothing to classify; ask for
        # input instead of reporting a verdict on an all-zero feature vector.
        st.warning("Please enter a message to check.")
    else:
        # Apply the same cleaning and vectorization that clean_text and the
        # loaded vectorizer define, then classify the single message.
        cleaned = clean_text(user_input)
        vectorized = vectorizer.transform([cleaned])
        prediction = model.predict(vectorized)[0]

        if prediction == "spam":
            st.error("🚨 This message looks like **Spam!**")
        else:
            st.success("✅ This message looks **Safe (Not Spam)**!")

