In [5]:
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
data = pd.read_csv(url, sep="\t", header=None, names=["label", "message"])


def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z ]+', '', text)
    return text


data["message_clean"] = data["message"].apply(preprocess)
data["label_num"] = data["label"].map({"ham": 0, "spam": 1})

X_train, X_test, y_train, y_test = train_test_split(data["message_clean"], data["label_num"], test_size=0.2, random_state=42)


vectorizer = TfidfVectorizer()
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)


model = MultinomialNB()
model.fit(X_train_tf, y_train)

y_pred = model.predict(X_test_tf)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📝 Classification Report:\n", classification_report(y_test, y_pred))

def check_email_spam(msg):
    msg_clean = preprocess(msg)
    msg_tf = vectorizer.transform([msg_clean])
    prediction = model.predict(msg_tf)[0]
    return "Spam" if prediction == 1 else "Not Spam"

email = "Congratulations! You've won a free iPhone. Call now."
print("\n🔍 Prediction for custom email:\n", email)
print("➡️ Result:", check_email_spam(email))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Accuracy: 0.957847533632287

📊 Confusion Matrix:
 [[966   0]
 [ 47 102]]

📝 Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98       966
           1       1.00      0.68      0.81       149

    accuracy                           0.96      1115
   macro avg       0.98      0.84      0.89      1115
weighted avg       0.96      0.96      0.95      1115


🔍 Prediction for custom email:
 Congratulations! You've won a free iPhone. Call now.
➡️ Result: Spam
