In [2]:
import nltk

# One-time fetch of the NLTK data this notebook needs:
#   - "punkt":     sentence/word tokenizer models used by word_tokenize on older NLTK releases
#   - "punkt_tab": the replacement tokenizer data that newer NLTK releases load instead;
#                  without it word_tokenize raises LookupError on a fresh environment
#   - "stopwords": the English stop-word list used when building features
# nltk.download is a no-op (apart from its log line) when a package is already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amiteshsinha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amiteshsinha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
import random

# Toy corpus: ten hand-labelled (text, label) pairs, alternating "spam" and "ham"
# (five of each). Order matters only until the feature sets are shuffled.
messages = [
    ("Win money now!!!", "spam"),
    ("Hi, can we meet for lunch?", "ham"),
    ("Exclusive deal just for you", "spam"),
    ("Don't forget the meeting at 10am", "ham"),
    ("Limited offer: Get your prize now", "spam"),
    ("Let’s catch up later this week", "ham"),
    ("Congratulations! You've been selected", "spam"),
    ("See you at the conference tomorrow", "ham"),
    ("Claim your free gift card", "spam"),
    ("Are you joining the team dinner tonight?", "ham"),
]


In [12]:
# Preprocessing: English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))


def message_features(message):
    """Turn a raw message into a bag-of-words feature dict.

    The message is lower-cased and tokenized with ``word_tokenize``. Tokens
    that are not purely alphabetic (punctuation, numbers, mixed tokens) or
    that appear in ``stop_words`` are dropped.

    Args:
        message: The message text.

    Returns:
        A dict mapping ``"contains(<word>)"`` to ``True`` for every kept word.
        Words that do not occur in the message have no entry.
    """
    features = {}
    for token in word_tokenize(message.lower()):
        # Skip anything that is not a plain word, and skip stop words.
        if not token.isalpha() or token in stop_words:
            continue
        features[f"contains({token})"] = True
    return features

# Create feature sets: one (feature dict, label) pair per sample message
feature_sets = [(message_features(text), label) for (text, label) in messages]

# Shuffle with a dedicated, seeded generator so the order -- and therefore the
# positional train/test split taken from this list -- is identical on every
# Restart & Run All. Using a private Random instance leaves the global
# `random` state untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(feature_sets)
feature_sets

[({'contains(exclusive)': True, 'contains(deal)': True}, 'spam'),
 ({'contains(win)': True, 'contains(money)': True}, 'spam'),
 ({'contains(claim)': True,
   'contains(free)': True,
   'contains(gift)': True,
   'contains(card)': True},
  'spam'),
 ({'contains(see)': True,
   'contains(conference)': True,
   'contains(tomorrow)': True},
  'ham'),
 ({'contains(forget)': True, 'contains(meeting)': True}, 'ham'),
 ({'contains(joining)': True,
   'contains(team)': True,
   'contains(dinner)': True,
   'contains(tonight)': True},
  'ham'),
 ({'contains(hi)': True, 'contains(meet)': True, 'contains(lunch)': True},
  'ham'),
 ({'contains(congratulations)': True, 'contains(selected)': True}, 'spam'),
 ({'contains(let)': True,
   'contains(catch)': True,
   'contains(later)': True,
   'contains(week)': True},
  'ham'),
 ({'contains(limited)': True,
   'contains(offer)': True,
   'contains(get)': True,
   'contains(prize)': True},
  'spam')]

In [10]:
# Train-test split
# feature_sets has already been shuffled, so a positional cut is a random split.
# The cut point is derived from a named fraction rather than a hard-coded index,
# so the ratio is kept if more messages are added (10 items -> 7 train / 3 test).
TRAIN_FRACTION = 0.7
split_index = round(len(feature_sets) * TRAIN_FRACTION)
train_set, test_set = feature_sets[:split_index], feature_sets[split_index:]

# Train the classifier on the training portion only
classifier = NaiveBayesClassifier.train(train_set)


In [15]:

# Evaluate: accuracy on the held-out examples, then the five features whose
# presence/absence best separates the two labels
held_out_accuracy = accuracy(classifier, test_set)
print("Accuracy:", held_out_accuracy)
classifier.show_most_informative_features(n=5)


Accuracy: 0.3333333333333333
Most Informative Features
         contains(catch) = None             spam : ham    =      1.4 : 1.0
        contains(dinner) = None             spam : ham    =      1.4 : 1.0
        contains(forget) = None             spam : ham    =      1.4 : 1.0
       contains(joining) = None             spam : ham    =      1.4 : 1.0
         contains(later) = None             spam : ham    =      1.4 : 1.0


In [17]:


# Predict a custom message: featurize a message that is not part of `messages`
# with the same message_features function used for the sample data, then ask
# the trained classifier for its label.
custom_message = "Win a free iPhone now!"
features = message_features(custom_message)
prediction = classifier.classify(features)

print(f"Message: '{custom_message}'\nPrediction: {prediction}")


Message: 'Win a free iPhone now!'
Prediction: spam
