<h2>Spam Email Classification Model Implementation</h2>

In [2]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('Datasets/spam.csv')
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [7]:
df['spam'] = df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [8]:
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
xtr,xts,ytr,yts = train_test_split(df.Message,df.spam,test_size=0.2)

In [11]:
xtr.shape

(4457,)

In [12]:
xts.shape

(1115,)

In [13]:
type(xtr)

pandas.core.series.Series

In [15]:
xtr[:4]

1364    Yetunde, i'm sorry but moji and i seem too bus...
2577                 In sch but neva mind u eat 1st lor..
5036    How many times i told in the stage all use to ...
4452    And that is the problem. You walk around in "j...
Name: Message, dtype: object

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
v = CountVectorizer()
xtr_cv = v.fit_transform(xtr.values)

In [18]:
xtr_cv.shape

(4457, 7696)

In [19]:
v.get_feature_names_out()

array(['00', '000', '000pes', ..., 'zoe', 'zogtorius', 'zouk'],
      shape=(7696,), dtype=object)

In [21]:
v.vocabulary_

{'yetunde': 7650,
 'sorry': 6293,
 'but': 1583,
 'moji': 4522,
 'and': 963,
 'seem': 5952,
 'too': 6931,
 'busy': 1582,
 'to': 6891,
 'be': 1271,
 'able': 757,
 'go': 3158,
 'shopping': 6077,
 'can': 1638,
 'you': 7662,
 'just': 3855,
 'please': 5213,
 'find': 2846,
 'some': 6263,
 'other': 4954,
 'way': 7365,
 'get': 3112,
 'what': 7434,
 'wanted': 7333,
 'us': 7170,
 'forgive': 2944,
 'me': 4371,
 'reply': 5676,
 'free': 2979,
 'via': 7228,
 'yahoo': 7621,
 'messenger': 4420,
 'in': 3626,
 'sch': 5900,
 'neva': 4708,
 'mind': 4447,
 'eat': 2516,
 '1st': 335,
 'lor': 4175,
 'how': 3512,
 'many': 4315,
 'times': 6862,
 'told': 6908,
 'the': 6774,
 'stage': 6397,
 'all': 913,
 'use': 7174,
 'laugh': 4000,
 'not': 4783,
 'listen': 4112,
 'aha': 874,
 'that': 6770,
 'is': 3732,
 'problem': 5392,
 'walk': 7315,
 'around': 1075,
 'julianaland': 3847,
 'oblivious': 4838,
 'going': 3171,
 'on': 4891,
 'say': 5890,
 'same': 5859,
 'things': 6803,
 'constantly': 1982,
 'they': 6799,
 'one': 489

In [22]:
xtr_np = xtr_cv.toarray()

In [23]:
xtr_np[0]

array([0, 0, 0, ..., 0, 0, 0], shape=(7696,))

In [24]:
np.where(xtr_np[0]!=0)

(array([ 757,  963, 1271, 1582, 1583, 1638, 2846, 2944, 2979, 3112, 3158,
        3855, 4371, 4420, 4522, 4954, 5213, 5676, 5952, 6077, 6263, 6293,
        6891, 6931, 7170, 7228, 7333, 7365, 7434, 7621, 7650, 7662]),)

In [25]:
xtr[:4]

1364    Yetunde, i'm sorry but moji and i seem too bus...
2577                 In sch but neva mind u eat 1st lor..
5036    How many times i told in the stage all use to ...
4452    And that is the problem. You walk around in "j...
Name: Message, dtype: object

In [27]:
xtr[:4][1364]

"Yetunde, i'm sorry but moji and i seem too busy to be able to go shopping. Can you just please find some other way to get what you wanted us to get. Please forgive me. You can reply free via yahoo messenger."

In [28]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
model = MultinomialNB()

In [31]:
model.fit(xtr_cv,ytr)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [32]:
xts_cv = v.transform(xts)

In [43]:
from sklearn.metrics import classification_report

In [44]:
y_pred = model.predict(xts_cv)

In [45]:
print(classification_report(yts,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       962
           1       0.98      0.93      0.95       153

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [49]:
emails = [
    # 1. Not Spam (Business Communication)
    "Dear John,\n\nI hope this email finds you well. I'm reaching out to confirm our meeting for tomorrow at 2:00 PM to discuss the Q3 budget proposal. Please let me know if you need to reschedule or require any documents ahead of time.\n\nBest regards,\nSarah Connor",

    # 2. Spam (Urgency/Financial Scam)
    "URGENT: Your Account Has Been Compromised! We detected suspicious activity on your bank account. You must click the link below within 2 HOURS to verify your identity and prevent IMMEDIATE CLOSURE. Failure to act will result in loss of funds. **CLICK HERE NOW** for the Security Portal.",

    # 3. Not Spam (E-commerce Confirmation)
    "Order Confirmation #10087\n\nThank you for your recent purchase from TechGadgets! Your order has been placed and will ship within 2 business days. You can track your order status here: [link to tracking page]. Your total charge was $499.99.\n\nThank you,\nTechGadgets Customer Service",

    # 4. Spam (Vague Lottery/Prize)
    "CONGRATULATIONS!!! You are the lucky winner of a FREE LUXURY VACATION! To claim your incredible, all-expenses-paid trip to an exotic location, simply reply to this email with your full name, address, and credit card number. This offer is valid for only **1 day**!",

    # 5. Not Spam (Newsletter/Subscription)
    "Weekly Digest: The Top 5 AI News\n\nRead about the latest developments in artificial intelligence, including new models from Google DeepMind and Open AI. In this week's issue:\n- Article 1: Large Language Models Explained\n- Article 2: Ethical Concerns in Robotics\n\nUnsubscribe link is at the bottom of this email."
]

In [50]:
emails_count = v.transform(emails)

In [51]:
model.predict(emails_count)

array([0, 1, 1, 1, 0])

<h2>Model Implementation (Easy Method using Pipeline)</h2>

In [53]:
from sklearn.pipeline import Pipeline

In [54]:
clf = Pipeline([
    ('vctorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])

In [58]:
clf.fit(xtr,ytr)

0,1,2
,steps,"[('vctorizer', ...), ('nb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [59]:
y_pred = clf.predict(xts)

In [60]:
print(classification_report(yts,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       962
           1       0.98      0.93      0.95       153

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [61]:
clf.predict(emails)

array([0, 1, 1, 1, 0])