# Using a Multinomial Naive Bayes classifier to detect spam - A trivial example

*Costas Andreopoulos \<c.andreopoulos@cern.ch\>*

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from IPython.display import display, HTML

In [2]:
# Toy corpus used for both training and testing, written out one class at a time.
# Messages labelled as spam:
spam_texts = [
    "Congratulations! You have won a gift card. Click here to claim.",
    "I am prince Ogbuluchukwu from Nigeria. Your help would be very appreciated!",
    "Earn money fast! Get rich with this simple trick.",
    "URGENT: Your Bank Account Has Been Suspended! Verify Now",
    "Your PayPal Account is on Hold! Verify Your Identity Immediately!",
    "Make £10000 a week - No experience needed!",
    "Earn Passive Income While You Sleep",
    "You’ve Won a Free iPhone – Claim Now!",
    "New Cryptocurrency Airdrop! Get FREE Ethereum Before It Runs Out!",
    "Win a Brand New Tesla Model S! Just Answer This One Question!",
    "Einstein was WRONG. Mass and consciousness are fundamentally the same thing!",
    "Shocking! This Celebrity Lost 50 Pounds with This One Trick!",
    "Your Amazon Order Has Been Delayed – Verify Your Details Now!",
    "FREE Bitcoin Giveaway! Claim Your Crypto Before It’s Gone!",
    "Government Grants Available. Claim Your Free Money Now",
    "Accept Our Exclusive Speaker Invitation for the World Summit on Nanomaterials for Space Science",
    "You Have an Unclaimed Inheritance of $2 Million – Contact Us Now!",
    "FINAL NOTICE: Your Netflix Subscription Will Be Suspended – Update Payment Now",
]

# Messages labelled as legitimate:
legit_texts = [
    "Hey, are we still on for lunch tomorrow?",
    "Can you send me the project files?",
    "Marks overdue!",
    "New mandatory training on Desk Posture Optimization for Maximum Productivity.",
    "University IT Policy Updates: Your Password Must Contain a Haiku.",
    "Proposal deadline next week.",
    "Major new bug found.",
    "Professor, I couldn’t submit my assignment because my laptop died",
    "Free cookies at the semimar room",
    "Review overdue by 3 weeks",
    "Badminton match this weekend",
    "Reminder: Library Books Due Next Week – Renew Online to Avoid Fines",
    "Reminder: Department Meeting Scheduled for Thursday at 10 AM.",
    "Your Monthly Internet Bill is Ready",
    "Request for Letter of Recommendation",
    "Upcoming Faculty Workshop on Effective Online Teaching Strategies",
    "Action Required - Annual Fire Safety Refresher Slides.",
    "Update from the Vice-Chancellor.",
]

# One row per message: the raw text and its class label, spam rows first.
# Labels are derived from the list lengths so texts and labels cannot drift apart.
data = pd.DataFrame(
    {
        "text": spam_texts + legit_texts,
        "label": ["spam"] * len(spam_texts) + ["legit"] * len(legit_texts),
    }
)

# pd.set_option("display.max_colwidth", None)  # No truncation of column contents
print(data)

                                                 text  label
0   Congratulations! You have won a gift card. Cli...   spam
1   I am prince Ogbuluchukwu from Nigeria. Your he...   spam
2   Earn money fast! Get rich with this simple trick.   spam
3   URGENT: Your Bank Account Has Been Suspended! ...   spam
4   Your PayPal Account is on Hold! Verify Your Id...   spam
5          Make £10000 a week - No experience needed!   spam
6                 Earn Passive Income While You Sleep   spam
7               You’ve Won a Free iPhone – Claim Now!   spam
8   New Cryptocurrency Airdrop! Get FREE Ethereum ...   spam
9   Win a Brand New Tesla Model S! Just Answer Thi...   spam
10  Einstein was WRONG. Mass and consciousness are...   spam
11  Shocking! This Celebrity Lost 50 Pounds with T...   spam
12  Your Amazon Order Has Been Delayed – Verify Yo...   spam
13  FREE Bitcoin Giveaway! Claim Your Crypto Befor...   spam
14  Government Grants Available. Claim Your Free M...   spam
15  Accept Our Exclusive

In [3]:
# Convert text to word frequency vectors
# X is a sparse matrix with shape: (number samples, size of vocabulary)
# NOTE(review): the vectorizer is fitted on the full dataset, i.e. before the train/test
# split performed in a later cell, so the vocabulary also covers words that only appear
# in test examples. Harmless for this toy example; for a real evaluation, fit it on the
# training split only.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data["text"])  # returns a _sparse_matrix_ with `Bag of Words' feature vectors
print(f"Shape of X (Examples in `Bag of Words' representation): {X.shape}")
# print(f"X.toarray(): \n {X.toarray()}")

# Extract the labels array: 1 for spam, 0 for legit
Y = data["label"].map({"spam": 1, "legit": 0})
# .map() turns any label missing from the dict into NaN without complaint; fail loudly instead
assert Y.notna().all(), "unexpected value in data['label'] (expected 'spam' or 'legit')"
print(f"Shape of Y (Example labels): {Y.shape}") # (n_examples, )
# print(f"Y: \n {Y}")

# Print the vocabulary
vocabulary = vectorizer.get_feature_names_out()
print(f"Vocabulary of {len(vocabulary)} words: {vocabulary}\n")

# Combine X and Y into one DataFrame (one column per vocabulary word, plus the label)
# and render it as an HTML table inside a scrollable box
XY_df = pd.DataFrame(X.toarray(), columns=vocabulary)
XY_df['*LABEL*'] = Y
display(HTML('<style> .dataframe {max-height: 300px; overflow-y: scroll; overflow-x: scroll; display: block; } </style>'))
display(HTML(XY_df.to_html()))

Shape of X (Examples in `Bag of Words' representation): (36, 210)
Shape of Y (Example labels): (36,)
Vocabulary of 210 words: ['10' '10000' '50' 'accept' 'account' 'action' 'airdrop' 'am' 'amazon'
 'an' 'and' 'annual' 'answer' 'appreciated' 'are' 'assignment' 'at'
 'available' 'avoid' 'badminton' 'bank' 'be' 'because' 'been' 'before'
 'bill' 'bitcoin' 'books' 'brand' 'bug' 'by' 'can' 'card' 'celebrity'
 'chancellor' 'claim' 'click' 'congratulations' 'consciousness' 'contact'
 'contain' 'cookies' 'couldn' 'crypto' 'cryptocurrency' 'deadline'
 'delayed' 'department' 'desk' 'details' 'died' 'due' 'earn' 'effective'
 'einstein' 'ethereum' 'exclusive' 'experience' 'faculty' 'fast' 'files'
 'final' 'fines' 'fire' 'for' 'found' 'free' 'from' 'fundamentally' 'get'
 'gift' 'giveaway' 'gone' 'government' 'grants' 'haiku' 'has' 'have'
 'help' 'here' 'hey' 'hold' 'identity' 'immediately' 'income'
 'inheritance' 'internet' 'invitation' 'iphone' 'is' 'it' 'just' 'laptop'
 'letter' 'library' 'lost' '

Unnamed: 0,10,10000,50,accept,account,action,airdrop,am,amazon,an,and,annual,answer,appreciated,are,assignment,at,available,avoid,badminton,bank,be,because,been,before,bill,bitcoin,books,brand,bug,by,can,card,celebrity,chancellor,claim,click,congratulations,consciousness,contact,contain,cookies,couldn,crypto,cryptocurrency,deadline,delayed,department,desk,details,died,due,earn,effective,einstein,ethereum,exclusive,experience,faculty,fast,files,final,fines,fire,for,found,free,from,fundamentally,get,gift,giveaway,gone,government,grants,haiku,has,have,help,here,hey,hold,identity,immediately,income,inheritance,internet,invitation,iphone,is,it,just,laptop,letter,library,lost,lunch,major,make,mandatory,marks,mass,match,maximum,me,meeting,million,model,money,monthly,must,my,nanomaterials,needed,netflix,new,next,nigeria,no,notice,now,of,ogbuluchukwu,on,one,online,optimization,order,our,out,overdue,passive,password,payment,paypal,policy,posture,pounds,prince,productivity,professor,project,proposal,question,ready,recommendation,refresher,reminder,renew,request,required,review,rich,room,runs,safety,same,scheduled,science,semimar,send,shocking,simple,sleep,slides,space,speaker,still,strategies,submit,subscription,summit,suspended,teaching,tesla,the,thing,this,thursday,to,tomorrow,training,trick,unclaimed,university,upcoming,update,updates,urgent,us,ve,verify,very,vice,was,we,week,weekend,weeks,while,will,win,with,won,workshop,world,would,wrong,you,your,*LABEL*
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1
1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1
5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1
8,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1


In [4]:
# Hold out 20% of the examples as a test sample; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=10,
)

In [5]:
# Instantiate a multinomial Naive Bayes classifier and fit it on the training sample
clf = MultinomialNB()
clf.fit(X=X_train, y=Y_train)

**Make predictions for the test set and check the accuracy**

Print the classification report, which includes the following metrics for each class:
 * $\displaystyle \text{Precision} = \frac{\text{True positives}}{\text{True positives + False positives}}$
   (when the model predicts a class, how often is it correct? "*Purity*" in HEP lingo)

   
 * $\displaystyle \text{Recall} = \frac{\text{True positives}}{\text{True positives + False negatives}}$
   (how many of the actual positive cases were correctly identified? "*Efficiency*" in HEP lingo)

   
 * $\displaystyle \text{F1-score} = 2 \frac{\text{Precision } \times \text{ Recall}}{\text{Precision } + \text{ Recall}}$
   (harmonic mean of Precision and Recall, ensures balance between the two)

 * $\displaystyle \text{Support}$: The number of true instances per class

In [6]:
# Predict a label (1 = spam, 0 = legit) for every example in the test sample
Y_pred = clf.predict(X_test)

# Overall accuracy: fraction of test examples whose predicted label matches the true one
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(f"Accuracy: {100*accuracy:.2f}%")

# Per-class precision, recall, F1-score and support
print("Classification Report:")
print(classification_report(y_true=Y_test, y_pred=Y_pred))

Accuracy: 75.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.80      0.80      0.80         5

    accuracy                           0.75         8
   macro avg       0.73      0.73      0.73         8
weighted avg       0.75      0.75      0.75         8



In [7]:
# Classify two previously unseen messages with the trained model
new_messages = [
    "Win a free iPhone! Click this link now.",
    "Hey, let's catch up this weekend.",
]
# Reuse the already-fitted vectorizer so the columns match those the classifier was trained on
new_X = vectorizer.transform(new_messages)  # Transform using the same BoW model

predictions = clf.predict(new_X)
print(predictions)  # 1 = Spam, 0 = Not Spam

[1 0]
