In [1]:
! pip install kagglehub[pandas-datasets]



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
import kagglehub
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")
print("path to files:",path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/sms-spam-collection-dataset?dataset_version_number=1...


100%|██████████| 211k/211k [00:00<00:00, 41.9MB/s]

Extracting files...
path to files: /root/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1





In [4]:
df = pd.read_csv(path + "/spam.csv", encoding='latin-1')
df = df.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis=1)
df.columns = ["label","message"]
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
print(df["message"].sample().iloc[0])  # picking random datea to see what things i have to perform in this dataset

Yeah do! DonÛ÷t stand to close tho- youÛ÷ll catch something!


In [6]:
print(df.isnull().sum())

label      0
message    0
dtype: int64


In [7]:
print(df.describe())

       label                 message
count   5572                    5572
unique     2                    5169
top      ham  Sorry, I'll call later
freq    4825                      30


In [8]:
print(df.label.value_counts())

label
ham     4825
spam     747
Name: count, dtype: int64


In [9]:
4825-747

4078

In [10]:
# data is imballence
from sklearn.utils import resample
df_majority = df[df.label=="ham"]
df_minority = df[df.label=="spam"]

df_minority_unsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)
df_balanced = pd.concat([df_majority, df_minority_unsampled])
print(df_balanced.label.value_counts())

label
ham     4825
spam    4825
Name: count, dtype: int64


In [11]:
print(df_balanced.shape)

(9650, 2)


In [12]:
df_balanced.label=df_balanced.label.replace({"ham":0,"spam":1})
df_balanced.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
6,0,Even my brother is not like to speak with me. ...


In [13]:
def convert_lower(text):
  return text.lower()

In [14]:
df_balanced["message"]=df_balanced["message"].apply(convert_lower)

In [15]:
import re
def clean_text(text, remove_numbers=True):
  if not isinstance(text, str):
    raise ValueError("Input Must Be String")

  text = re.sub(r'[^a-zA-Z0-9\s]',' ',text)
  if remove_numbers:
    text = re.sub(r'\d+',' ',text)

  text = re.sub(r'\s+',' ', text ).strip()
  return text

In [16]:
df_balanced["message"] = df_balanced["message"].apply(clean_text)

In [17]:
df_balanced.tail()

Unnamed: 0,label,message
4246,1,text pass to to collect your polyphonic ringto...
3673,1,you have won a nokia i this is what you get wh...
3618,1,p alfie moon s children in need song on ur mob...
3499,1,dorothy kiefer com bank of granite issues stro...
4235,1,u can win of music gift vouchers every week st...


In [18]:
from nltk.stem import PorterStemmer
def stem_text(text):
    if not isinstance(text, str):
        raise ValueError("Input Must Be string")

    ps = PorterStemmer()
    result = []

    # Split text into words first!
    for word in text.split():  # ← This iterates over WORDS
        result.append(ps.stem(word))

    return " ".join(result)

In [19]:
df_balanced.message = df_balanced.message.apply(stem_text)

In [20]:
df_balanced.head()

Unnamed: 0,label,message
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif u oni
3,0,u dun say so earli hor u c alreadi then say
4,0,nah i don t think he goe to usf he live around...
6,0,even my brother is not like to speak with me t...


In [21]:
def stem_text_concise(text):
    if not isinstance(text, str):
        raise ValueError("Input Must Be string")

    ps = PorterStemmer()
    return " ".join([ps.stem(word) for word in text.split()])

In [22]:
df_balanced.message = df_balanced.message.apply(stem_text_concise)


In [23]:
df_balanced.head()

Unnamed: 0,label,message
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif u oni
3,0,u dun say so earli hor u c alreadi then say
4,0,nah i don t think he goe to usf he live around...
6,0,even my brother is not like to speak with me t...


In [41]:
print(df_balanced.isnull().sum())

label      0
message    0
dtype: int64


In [44]:
print(df_balanced["label"].value_counts())

label
0    4825
1    4825
Name: count, dtype: int64


In [45]:
4825 + 4825

9650

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(df_balanced.message).toarray()


In [47]:
X.shape


(9650, 6295)

In [49]:
y = df_balanced.label

In [50]:
y.shape

(9650,)

In [52]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [55]:
print(X_train.shape)
print(y_train.shape)

(7720, 6295)
(7720,)


In [56]:
print(X_test.shape)
print(y_test.shape)

(1930, 6295)
(1930,)


In [62]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score,  confusion_matrix, classification_report

In [63]:
# trying GaussianNB
gs = GaussianNB()
gs.fit(X_train, y_train)
y_predict = gs.predict(X_test)

In [64]:
print(accuracy_score(y_test, y_predict))

0.9295336787564766


In [65]:
print(confusion_matrix(y_predict, y_test))

[[850   1]
 [135 944]]


In [66]:
print(classification_report(y_predict, y_test))

              precision    recall  f1-score   support

           0       0.86      1.00      0.93       851
           1       1.00      0.87      0.93      1079

    accuracy                           0.93      1930
   macro avg       0.93      0.94      0.93      1930
weighted avg       0.94      0.93      0.93      1930



In [67]:
# trying Multinomial
ms = MultinomialNB()
ms.fit(X_train, y_train)
y_predict1 = ms.predict(X_test)

In [68]:
print(accuracy_score(y_test,y_predict1))

0.9849740932642487


In [69]:
print(confusion_matrix(y_predict1,y_test))

[[972  16]
 [ 13 929]]


In [70]:
print(classification_report(y_predict1, y_test))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       988
           1       0.98      0.99      0.98       942

    accuracy                           0.98      1930
   macro avg       0.98      0.99      0.98      1930
weighted avg       0.98      0.98      0.98      1930



In [71]:
# testing bernouli
br = BernoulliNB()
br.fit(X_train, y_train)
y_predict2 = br.predict(X_test)

In [72]:
print(accuracy_score(y_test, y_predict2))

0.9813471502590674


In [73]:
print(confusion_matrix(y_predict2,y_test))

[[984  35]
 [  1 910]]


In [74]:
print(classification_report(y_predict2,y_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98      1019
           1       0.96      1.00      0.98       911

    accuracy                           0.98      1930
   macro avg       0.98      0.98      0.98      1930
weighted avg       0.98      0.98      0.98      1930



In [77]:
ham_messages = [
    "Hey, are we still meeting for lunch at 1 pm?",
    "I'll call you back after the meeting ends.",
    "Can you send me the notes from today's class?",
    "Don't forget to bring your charger tomorrow.",
    "Let's catch up over coffee this weekend!",
    "I just reached home safely.",
    "Happy birthday! Wishing you an amazing year ahead!",
    "Could you please check your email and confirm?",
    "The movie last night was awesome, we should go again.",
    "I will be late to the party, save me some food!"
]

for message in ham_messages:
    X_input = cv.transform([message]).toarray()

    print(f"Testing Message: {message}")

    # Gaussian NB
    pred_gs = gs.predict(X_input)[0]
    print(f"GaussianNB Prediction: {'ham' if pred_gs == 0 else 'spam'}")

    # Multinomial NB
    pred_ms = ms.predict(X_input)[0]
    print(f"MultinomialNB Prediction: {'ham' if pred_ms == 0 else 'spam'}")

    # Bernoulli NB
    pred_br = br.predict(X_input)[0]
    print(f"BernoulliNB Prediction: {'ham' if pred_br == 0 else 'spam'}")

    print("-" * 50)


Testing Message: Hey, are we still meeting for lunch at 1 pm?
GaussianNB Prediction: ham
MultinomialNB Prediction: ham
BernoulliNB Prediction: ham
--------------------------------------------------
Testing Message: I'll call you back after the meeting ends.
GaussianNB Prediction: spam
MultinomialNB Prediction: ham
BernoulliNB Prediction: ham
--------------------------------------------------
Testing Message: Can you send me the notes from today's class?
GaussianNB Prediction: spam
MultinomialNB Prediction: ham
BernoulliNB Prediction: ham
--------------------------------------------------
Testing Message: Don't forget to bring your charger tomorrow.
GaussianNB Prediction: spam
MultinomialNB Prediction: ham
BernoulliNB Prediction: ham
--------------------------------------------------
Testing Message: Let's catch up over coffee this weekend!
GaussianNB Prediction: ham
MultinomialNB Prediction: ham
BernoulliNB Prediction: ham
--------------------------------------------------
Testing Mess

In [78]:
# testing model
spam_messages = [
    "Congratulations! You have won a $1000 Walmart gift card. Click here to claim now!",
    "URGENT! Your account has been compromised. Verify your details immediately at this link.",
    "You've been selected for a free vacation to the Bahamas! Call now to claim your prize.",
    "Earn $500 per day from home. Limited spots available, join today!",
    "Exclusive deal just for you! Buy 1 get 1 free. Hurry, offer expires soon!",
    "FREE entry in 2 a weekly competition to win an iPhone. Text WIN to 80085 now!",
    "You have been pre-approved for a $10,000 loan. No credit check required!",
    "Click this link to get your FREE Amazon voucher before it expires!",
    "Your mobile number has won a lottery of $500,000. Reply to claim immediately!",
    "Get rich quick! Learn the secret to making money online. Limited time offer!"
]

for message in spam_messages:
    X_input = cv.transform([message]).toarray()

    print(f"Testing Message: {message}")

    # Gaussian NB
    pred_gs = gs.predict(X_input)[0]
    print(f"GaussianNB Prediction: {'ham' if pred_gs == 0 else 'spam'}")

    # Multinomial NB
    pred_ms = ms.predict(X_input)[0]
    print(f"MultinomialNB Prediction: {'ham' if pred_ms == 0 else 'spam'}")

    # Bernoulli NB
    pred_br = br.predict(X_input)[0]
    print(f"BernoulliNB Prediction: {'ham' if pred_br == 0 else 'spam'}")

    print("-" * 50)

Testing Message: Congratulations! You have won a $1000 Walmart gift card. Click here to claim now!
GaussianNB Prediction: spam
MultinomialNB Prediction: spam
BernoulliNB Prediction: spam
--------------------------------------------------
Testing Message: URGENT! Your account has been compromised. Verify your details immediately at this link.
GaussianNB Prediction: spam
MultinomialNB Prediction: spam
BernoulliNB Prediction: spam
--------------------------------------------------
Testing Message: You've been selected for a free vacation to the Bahamas! Call now to claim your prize.
GaussianNB Prediction: spam
MultinomialNB Prediction: spam
BernoulliNB Prediction: spam
--------------------------------------------------
Testing Message: Earn $500 per day from home. Limited spots available, join today!
GaussianNB Prediction: ham
MultinomialNB Prediction: spam
BernoulliNB Prediction: ham
--------------------------------------------------
Testing Message: Exclusive deal just for you! Buy 1 ge

In [79]:
# multinomial is winning here lets make file of it
import pickle
def save_model(cv, model, filename):
  pipeline ={
      "vectorizer":cv,
      "model":model,
      "metadata":{
          'created_date':pd.Timestamp.now().strftime("%y-%m-%d"),
          'model_type':type(model).__name__,
          'vectorizer_type':type(cv).__name__
      }
  }
  with open (filename, 'wb') as f:
    pickle.dump(pipeline,f)
    print(f"model saved at {filename}")
    return filename

In [80]:
save_model(cv,ms,"multinomial_spam_detector.pkl")

model saved at multinomial_spam_detector.pkl


'multinomial_spam_detector.pkl'