In [1]:
# 1. Setup
# Install dependencies
!pip install --quiet nltk scikit-learn pandas

# Download NLTK resources
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import pandas as pd

# Create SMS dataset
data = [
    ("ham", "Hey, are we still on for lunch today?"),
    ("spam", "URGENT! You've won $1000! Click here now!"),
    ("ham", "Can you pick up milk on your way home?"),
    ("spam", "FREE iPhone! Limited time offer! Call now!"),
    ("ham", "Meeting moved to 3pm tomorrow"),
    ("spam", "Congratulations! You've been selected for a special offer!"),
    ("ham", "Thanks for the birthday wishes!"),
    ("spam", "SALE ALERT: 90% off everything! Don't miss out!"),
    ("ham", "Running late, be there in 10 minutes"),
    ("spam", "You owe $500 in taxes. Pay immediately or face legal action!")
]

df = pd.DataFrame(data, columns=['label','message'])
df.head()

Unnamed: 0,label,message
0,ham,"Hey, are we still on for lunch today?"
1,spam,URGENT! You've won $1000! Click here now!
2,ham,Can you pick up milk on your way home?
3,spam,FREE iPhone! Limited time offer! Call now!
4,ham,Meeting moved to 3pm tomorrow


In [4]:
# 3. Rule-Based Classifier
import re

# Define spam keywords
spam_keywords = {'urgent', 'free', 'offer', 'sale', 'click', 'congratulations', 'winner', 'won', 'alert'}

def rule_based_classifier(text):
    tokens = re.findall(r'\b\w+\b', text.lower())
    return 'spam' if any(word in spam_keywords for word in tokens) else 'ham'

# Apply rule-based classifier
df['pred_rule'] = df['message'].apply(rule_based_classifier)
df

Unnamed: 0,label,message,pred_rule
0,ham,"Hey, are we still on for lunch today?",ham
1,spam,URGENT! You've won $1000! Click here now!,spam
2,ham,Can you pick up milk on your way home?,ham
3,spam,FREE iPhone! Limited time offer! Call now!,spam
4,ham,Meeting moved to 3pm tomorrow,ham
5,spam,Congratulations! You've been selected for a sp...,spam
6,ham,Thanks for the birthday wishes!,ham
7,spam,SALE ALERT: 90% off everything! Don't miss out!,spam
8,ham,"Running late, be there in 10 minutes",ham
9,spam,You owe $500 in taxes. Pay immediately or face...,ham


In [5]:
# 4. ML-Based Classifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label'], test_size=0.3, random_state=42)

# Vectorize text
vec = CountVectorizer(stop_words='english')
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Train Naïve Bayes
clf = MultinomialNB().fit(X_train_vec, y_train)
df.loc[X_test.index, 'pred_ml'] = clf.predict(X_test_vec)
df

Unnamed: 0,label,message,pred_rule,pred_ml
0,ham,"Hey, are we still on for lunch today?",ham,
1,spam,URGENT! You've won $1000! Click here now!,spam,ham
2,ham,Can you pick up milk on your way home?,ham,
3,spam,FREE iPhone! Limited time offer! Call now!,spam,
4,ham,Meeting moved to 3pm tomorrow,ham,
5,spam,Congratulations! You've been selected for a sp...,spam,spam
6,ham,Thanks for the birthday wishes!,ham,
7,spam,SALE ALERT: 90% off everything! Don't miss out!,spam,
8,ham,"Running late, be there in 10 minutes",ham,ham
9,spam,You owe $500 in taxes. Pay immediately or face...,ham,


In [6]:
# 5. Evaluation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label'], test_size=0.3, random_state=42)

# Vectorize text
vec = CountVectorizer(stop_words='english')
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Train Naïve Bayes
clf = MultinomialNB().fit(X_train_vec, y_train)
df.loc[X_test.index, 'pred_ml'] = clf.predict(X_test_vec)
df

Unnamed: 0,label,message,pred_rule,pred_ml
0,ham,"Hey, are we still on for lunch today?",ham,
1,spam,URGENT! You've won $1000! Click here now!,spam,ham
2,ham,Can you pick up milk on your way home?,ham,
3,spam,FREE iPhone! Limited time offer! Call now!,spam,
4,ham,Meeting moved to 3pm tomorrow,ham,
5,spam,Congratulations! You've been selected for a sp...,spam,spam
6,ham,Thanks for the birthday wishes!,ham,
7,spam,SALE ALERT: 90% off everything! Don't miss out!,spam,
8,ham,"Running late, be there in 10 minutes",ham,ham
9,spam,You owe $500 in taxes. Pay immediately or face...,ham,


#### 6. Analysis Questions

In [9]:
# Which classifier achieved higher overall accuracy on the test set?

from sklearn.metrics import classification_report, accuracy_score

# Prepare y_true and predictions for test set
y_true = y_test.reset_index(drop=True)
y_pred_rule = df.loc[X_test.index, 'pred_rule'].reset_index(drop=True)
y_pred_ml = df.loc[X_test.index, 'pred_ml'].reset_index(drop=True)

print("--- Rule-Based Classifier Evaluation ---")
print("Accuracy:", accuracy_score(y_true, y_pred_rule))
print(classification_report(y_true, y_pred_rule, target_names=["ham", "spam"]))

print("\n=== ML-Based Classifier Evaluation (Naive Bayes) ===")
print("Accuracy:", accuracy_score(y_true, y_pred_ml))
print(classification_report(y_true, y_pred_ml, target_names=["ham", "spam"]))


--- Rule-Based Classifier Evaluation ---
Accuracy: 1.0
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00         1
        spam       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3


=== ML-Based Classifier Evaluation (Naive Bayes) ===
Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

         ham       0.50      1.00      0.67         1
        spam       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



Rule-Based may miss spam messages that don’t include the predefined keywords.

ML-Based learns from a broader pattern in the training data and generalizes better.

In [10]:
## Compare precision and recall for the “spam” class in both approaches. Which method better balances false positives vs. false negatives?
from sklearn.metrics import precision_score, recall_score, f1_score

# Ensure true and predicted labels for test set are aligned
y_true = y_test.reset_index(drop=True)
y_pred_rule = df.loc[X_test.index, 'pred_rule'].reset_index(drop=True)
y_pred_ml = df.loc[X_test.index, 'pred_ml'].reset_index(drop=True)

# Focus only on 'spam' class
labels = ['ham', 'spam']
spam_index = labels.index('spam')

# Rule-Based Classifier Metrics
precision_rule = precision_score(y_true, y_pred_rule, pos_label='spam')
recall_rule = recall_score(y_true, y_pred_rule, pos_label='spam')
f1_rule = f1_score(y_true, y_pred_rule, pos_label='spam')

# ML-Based Classifier Metrics
precision_ml = precision_score(y_true, y_pred_ml, pos_label='spam')
recall_ml = recall_score(y_true, y_pred_ml, pos_label='spam')
f1_ml = f1_score(y_true, y_pred_ml, pos_label='spam')

# Print comparison
print("=== Spam Class Comparison ===")
print("Rule-Based Classifier:")
print(f"  Precision: {precision_rule:.2f}")
print(f"  Recall:    {recall_rule:.2f}")
print(f"  F1-Score:  {f1_rule:.2f}")

print("\nML-Based Classifier (Naïve Bayes):")
print(f"  Precision: {precision_ml:.2f}")
print(f"  Recall:    {recall_ml:.2f}")
print(f"  F1-Score:  {f1_ml:.2f}")

# Precision shows how many predicted spams were correct (low = too many false positives).

# Recall shows how many actual spams were detected (low = too many false negatives).

# F1-score balances both.

=== Spam Class Comparison ===
Rule-Based Classifier:
  Precision: 1.00
  Recall:    1.00
  F1-Score:  1.00

ML-Based Classifier (Naïve Bayes):
  Precision: 1.00
  Recall:    0.50
  F1-Score:  0.67


In [11]:
# Display mismatches between actual and predicted labels for both models
comparison_df = df.loc[X_test.index].copy()
comparison_df['actual'] = y_test.reset_index(drop=True)
comparison_df['ml_correct'] = comparison_df['actual'] == comparison_df['pred_ml']
comparison_df['rule_correct'] = comparison_df['actual'] == comparison_df['pred_rule']

# Show mismatches for both
print("Classification Comparison:")
print(comparison_df[['message', 'actual', 'pred_rule', 'rule_correct', 'pred_ml', 'ml_correct']])


Classification Comparison:
                                             message actual pred_rule  \
8               Running late, be there in 10 minutes    NaN       ham   
1          URGENT! You've won $1000! Click here now!   spam      spam   
5  Congratulations! You've been selected for a sp...    NaN      spam   

   rule_correct pred_ml  ml_correct  
8         False     ham       False  
1          True     ham       False  
5         False    spam       False  


Rule-Based NLP
Strengths:

Simple, easy to interpret

No training data needed

Fast and lightweight

Weaknesses:

Doesn’t handle linguistic variation or synonyms

Fragile to changes in phrasing

Can’t generalize (e.g., misses: "Win a prize now!" if "win" not in keyword list)



ML-Based NLP
Strengths:

Learns patterns from data

Generalizes to unseen messages

Adapts to complex spam indicators (e.g., frequency of punctuation, word co-occurrence)

Weaknesses:

Requires labeled training data

Needs preprocessing, training time

Can misclassify rare terms or unbalanced datasets

In [12]:
# 1. Expand Rule-Based Keyword List & Measure Performance
import re
from sklearn.metrics import classification_report

# 📌 Expanded keyword list — covering more realistic spam words
expanded_spam_keywords = {
    'urgent', 'free', 'offer', 'sale', 'click', 'congratulations', 'winner',
    'won', 'alert', 'cash', 'deal', 'buy', 'discount', 'prize', 'selected',
    'limited', 'now', 'act', 'immediately', 'call', 'money', 'win', 'gift'
}

# 🔁 Updated rule-based classifier with expanded keywords
def expanded_rule_based_classifier(text):
    tokens = re.findall(r'\b\w+\b', text.lower())
    return 'spam' if any(word in expanded_spam_keywords for word in tokens) else 'ham'

# 🧠 Apply updated rule-based classifier
df['pred_rule_expanded'] = df['message'].apply(expanded_rule_based_classifier)

# 📊 Evaluate new rule-based performance
y_true = df.loc[X_test.index, 'label']
y_pred_rule_expanded = df.loc[X_test.index, 'pred_rule_expanded']

print("=== Rule-Based (Expanded Keywords) Classification Report ===")
print(classification_report(y_true, y_pred_rule_expanded, target_names=['ham', 'spam']))


# We expanded the keyword list to catch more types of spam language.
# This improves recall (fewer missed spams), but could slightly reduce precision
# (risking false positives if keywords appear in normal messages).

=== Rule-Based (Expanded Keywords) Classification Report ===
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00         1
        spam       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



In [13]:
# How ML-Based Handles Unseen Patterns?

# New unseen message with indirect spam language
new_message = ["This is your last chance to claim exclusive benefits. Hurry!"]

# Predict with Rule-Based (Expanded)
rule_prediction = expanded_rule_based_classifier(new_message[0])
print("Rule-Based (Expanded) Prediction:", rule_prediction)

# Predict with ML-Based
new_vec = vec.transform(new_message)
ml_prediction = clf.predict(new_vec)[0]
print("ML-Based Prediction:", ml_prediction)

# If the new message doesn’t contain any of the expanded keywords, rule-based will likely fail.
# ML may still catch it as spam if it has learned the phrasing “claim exclusive benefits” or similar from the training data.

Rule-Based (Expanded) Prediction: ham
ML-Based Prediction: ham
