In [None]:
import kagglehub

# Fetch the latest version of the dataset; the returned value is the local
# location of the downloaded files and is printed below for reference.
DATASET_HANDLE = "uciml/sms-spam-collection-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'sms-spam-collection-dataset' dataset.
Path to dataset files: /kaggle/input/sms-spam-collection-dataset


In [None]:
# Spam SMS detection: TF-IDF features + Multinomial Naive Bayes classifier


import os

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


# -------------------------------
# Configuration
# -------------------------------
DATA_FILE = "spam.csv"
TEST_SIZE = 0.2       # fraction of rows held out for evaluation
RANDOM_STATE = 42     # fixed seed so the split is reproducible
MAX_FEATURES = 3000   # vocabulary cap for the TF-IDF vectorizer
LABEL_TO_ID = {'ham': 0, 'spam': 1}

# -------------------------------
# Load data
# -------------------------------
# Build the CSV location from the directory kagglehub returned in the download
# cell (`path`) instead of hard-coding the Colab/Kaggle mount point, so the
# notebook also runs where the dataset is cached somewhere else.
# Requires the download cell to have been executed first.
data_path = os.path.join(path, DATA_FILE)

# Only v1 (label) and v2 (message text) are used. Reading just those columns
# and renaming them yields a fresh frame rather than a slice of a wider one,
# so the later column assignment cannot act on a view.
sms_raw = (
    pd.read_csv(data_path, encoding='latin-1', usecols=['v1', 'v2'])
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Fail loudly on labels outside the mapping: .map() would silently turn them
# into NaN and the error would only surface later, during split or fit.
unknown_labels = (
    sms_raw.loc[~sms_raw['label'].isin(list(LABEL_TO_ID)), 'label'].unique().tolist()
)
if unknown_labels:
    raise ValueError(f"Unexpected label values in {data_path}: {unknown_labels}")

# Encode labels numerically: ham -> 0, spam -> 1.
df = sms_raw.assign(label=sms_raw['label'].map(LABEL_TO_ID))

# -------------------------------
# Train-Test Split
# -------------------------------
# Stratify on the label so both partitions keep the same ham/spam ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=df['label'],
)

# -------------------------------
# Features and model
# -------------------------------
# Fit the vectorizer on the training split only, then reuse it unchanged for
# the test split so no test-set statistics leak into the features.
vectorizer = TfidfVectorizer(stop_words='english', max_features=MAX_FEATURES)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# -------------------------------
# Model Evaluation
# -------------------------------
y_pred = model.predict(X_test_tfidf)
# Column 1 of predict_proba is the probability of class 1 (spam).
y_prob = model.predict_proba(X_test_tfidf)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9730941704035875
ROC-AUC Score: 0.9889775869495742

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       0.98      0.81      0.89       149

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [None]:


# Score one hand-written message with the already-fitted vectorizer and model.
test_sms = [
    "Congrats! 1 year special cinema pass for 2 is yours. call 09061209465 now! "
    "C Suprman V, Matrix3, StarWars3, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out!"
]

# transform (not fit_transform): reuse the vocabulary learned during training.
test_sms_tfidf = vectorizer.transform(test_sms)

# Hard class decision for each message in the list.
prediction = model.predict(test_sms_tfidf)
# Row 0 is the only message; column 1 is the probability of class 1 (spam).
prediction_prob = model.predict_proba(test_sms_tfidf)[0, 1]

print("Prediction: SPAM" if prediction[0] == 1 else "Prediction: HAM")
print("Spam Probability:", prediction_prob)


Prediction: SPAM
Spam Probability: 0.8380302629417337
