In [86]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [87]:
# LOAD & CLEAN DATA
url = "https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv"

# Load data
sms_data = pd.read_csv(url, encoding='latin-1')

# Keep only needed columns (drop NaN columns)
sms_data = sms_data[['v1', 'v2']]

# Rename for clarity
sms_data.columns = ['label', 'text']

# Check data
print("=" * 50)
print("DATA OVERVIEW")
print("=" * 50)
print(f"Shape: {sms_data.shape}")
print(f"\nFirst 5 rows:")
print(sms_data.head())
print(f"\nClass distribution:")
print(sms_data['label'].value_counts())
print("=" * 50)

DATA OVERVIEW
Shape: (5572, 2)

First 5 rows:
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Class distribution:
label
ham     4825
spam     747
Name: count, dtype: int64


In [88]:
# CONVERT TEXT TO NUMBERS (TF-IDF)
vectorizer = TfidfVectorizer()

# X = features (the text messages converted to numbers)
X = vectorizer.fit_transform(sms_data['text'])

# y = target (spam or ham labels)
y = sms_data['label']

print(f"\nX shape: {X.shape}")  # (5572, thousands_of_features)
print(f"y shape: {y.shape}")    # (5572,)



X shape: (5572, 8672)
y shape: (5572,)


In [89]:
# TRAIN/TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=1
)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")


Training set: (4457, 8672)
Test set: (1115, 8672)


In [90]:
# TRAIN MODEL 1 - LOGISTIC REGRESSION
print("\nðŸ”´ MODEL 1: LOGISTIC REGRESSION")

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))
print(f"\nAccuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr))


ðŸ”´ MODEL 1: LOGISTIC REGRESSION
Confusion Matrix:
[[974   2]
 [ 26 113]]

Accuracy: 0.9749

Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       976
        spam       0.98      0.81      0.89       139

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [91]:
# TRAIN MODEL 2 - NAIVE BAYES
print("\nðŸ”µ MODEL 2: NAIVE BAYES")

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))
print(f"\nAccuracy: {accuracy_score(y_test, y_pred_nb):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nb))



ðŸ”µ MODEL 2: NAIVE BAYES
Confusion Matrix:
[[976   0]
 [ 34 105]]

Accuracy: 0.9695

Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       976
        spam       1.00      0.76      0.86       139

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [92]:
# STEP 7: TRAIN MODEL 3 - RANDOM FOREST

print("\nðŸŸ¢ MODEL 3: RANDOM FOREST")

rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print(f"\nAccuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))



ðŸŸ¢ MODEL 3: RANDOM FOREST
Confusion Matrix:
[[976   0]
 [ 13 126]]

Accuracy: 0.9883

Classification Report:
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       976
        spam       1.00      0.91      0.95       139

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [93]:
# TEST ON NEW MESSAGE

def predict_spam(message):
    """Predict if a new message is spam or ham"""
    # Convert message to TF-IDF
    message_tfidf = vectorizer.transform([message])

    # Predict using best model (Naive Bayes)
    prediction = nb_model.predict(message_tfidf)[0]
    probability = nb_model.predict_proba(message_tfidf)[0]

    print(f"\nMessage: '{message}'")
    print(f"Prediction: {prediction}")
    print(f"Confidence: {max(probability):.2%}")

    return prediction

# Test examples
print("\n" + "=" * 50)
print("ðŸ§ª TESTING NEW MESSAGES")
print("=" * 50)

predict_spam("URGENT! You've won $1000! Click here now!")
predict_spam("Hey, are we still meeting for coffee tomorrow?")
predict_spam("FREE entry to win a prize! Text WIN to 12345")
predict_spam("Can you pick up milk on your way home?")



ðŸ§ª TESTING NEW MESSAGES

Message: 'URGENT! You've won $1000! Click here now!'
Prediction: ham
Confidence: 61.40%

Message: 'Hey, are we still meeting for coffee tomorrow?'
Prediction: ham
Confidence: 99.72%

Message: 'FREE entry to win a prize! Text WIN to 12345'
Prediction: spam
Confidence: 85.31%

Message: 'Can you pick up milk on your way home?'
Prediction: ham
Confidence: 99.77%


np.str_('ham')