This is a classic binary classification problem where we classify messages as either Spam or Not Spam.

In [3]:
# Import libraries
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [4]:
# 1. Create a small dataset
data = {
    "text": [
        "Congratulations! You won a free iPhone. Click here to claim now!",  # Spam
        "Hey, are we still meeting for lunch today?",  # Not Spam
        "Urgent! Your bank account is compromised. Act now!",  # Spam
        "Don't forget to submit the project by tomorrow.",  # Not Spam
        "You have been selected for a $1000 gift card. Claim it now!",  # Spam
        "Can you send me the notes from class?",  # Not Spam
        "Limited time offer! Get 50% off on all items.",  # Spam
        "Hey, how was your weekend?",  # Not Spam
        "Win a free vacation! Call now to claim your prize.",  # Spam
        "Let's catch up soon!",  # Not Spam
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1 = Spam, 0 = Not Spam
}

df = pd.DataFrame(data)

# 2. Preprocess the text (convert to lowercase, remove special characters)
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['clean_text'] = df['text'].apply(preprocess)

# 3. Convert text into numerical features using CountVectorizer (Bag of Words)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['clean_text'])
y = np.array(df['label'])

In [5]:
# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5. Train a Naïve Bayes classifier (best for text classification)
model = MultinomialNB()
model.fit(X_train, y_train)

In [6]:
# 6. Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# 7. Test with new messages
new_messages = [
    "You have won a free laptop! Claim your prize now.",
    "Hey, want to grab coffee later?",
    "Act fast! Your credit card is at risk.",
    "Don't forget to call mom today."
]

new_messages_clean = [preprocess(msg) for msg in new_messages]
new_features = vectorizer.transform(new_messages_clean)
predictions = model.predict(new_features)

# Print Predictions
for msg, pred in zip(new_messages, predictions):
    category = "Spam" if pred == 1 else "Not Spam"
    print(f"'{msg}' → {category}")

Model Accuracy: 1.00
'You have won a free laptop! Claim your prize now.' → Spam
'Hey, want to grab coffee later?' → Not Spam
'Act fast! Your credit card is at risk.' → Spam
'Don't forget to call mom today.' → Not Spam


### Training on open-sources dataset

In [10]:
import pandas as pd
import urllib.request
import zipfile
import os

# Define URL and local paths
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
zip_path = "smsspamcollection.zip"
data_folder = "sms_data"

# Download the ZIP file
urllib.request.urlretrieve(url, zip_path)

# Unzip the file
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(data_folder)

# Load the dataset into a pandas DataFrame
data_file = os.path.join(data_folder, "SMSSpamCollection")
df = pd.read_csv(data_file, sep="\t", header=None, names=["label", "text"])

# Preview
print("Sample Data:")
print(df.head())
print(f"\nTotal samples: {len(df)}")

# Optional: convert labels to binary (spam=1, ham=0)
df["label_num"] = df["label"].map({"ham": 0, "spam": 1})


print("--------------------------------------")
print("Cleaning the text data")
df['clean_text'] = df['text'].apply(preprocess)

df.head()


Sample Data:
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Total samples: 5572
--------------------------------------
Cleaning the text data


Unnamed: 0,label,text,label_num,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",0,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,0,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,0,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah i don t think he goes to usf he lives arou...


In [11]:
# 3. Convert text into numerical features using CountVectorizer (Bag of Words)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['clean_text'])
y = np.array(df['label_num'])

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5. Train a Naïve Bayes classifier (best for text classification)
model = MultinomialNB()
model.fit(X_train, y_train)

In [12]:
# 6. Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# 7. Test with new messages
new_messages = [
    "You have won a free laptop! Claim your prize now.",
    "Hey, want to grab coffee later?",
    "Act fast! Your credit card is at risk.",
    "Don't forget to call mom today."
]

new_messages_clean = [preprocess(msg) for msg in new_messages]
new_features = vectorizer.transform(new_messages_clean)
predictions = model.predict(new_features)

# Print Predictions
for msg, pred in zip(new_messages, predictions):
    category = "Spam" if pred == 1 else "Not Spam"
    print(f"'{msg}' → {category}")

Model Accuracy: 0.99
'You have won a free laptop! Claim your prize now.' → Spam
'Hey, want to grab coffee later?' → Not Spam
'Act fast! Your credit card is at risk.' → Not Spam
'Don't forget to call mom today.' → Not Spam


Here we can demonstated training on a short custom dataset (approx. 10 samples) and training on 5k+ samples. 
The perforamnce per samples for the new message are displayed above. 

A particule sentence:  'Hey, want to grab coffee later?' is labelled as "spam" by first classifier (on small custom data) and non-spam by MultinomialNB. 