In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Determining Category wise Count:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [7]:
#Numeric representation of Spam and Ham (1 and 0) as a column:
#Inside apply() we supply our transformation function to store 0 or 1 based on ham or spam
df['spam'] = df['Category'].apply(lambda x : 1 if x == 'spam' else 0) 

df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
#Splitting Main Dataset into Training and Testing Subsets for independent and dependant variables/features:
from sklearn.model_selection import train_test_split

#X_train and X_test will target the message (message body) feature or variable which is independent:
#y_train and y_test will target the spam feature or var with is dependent on message:

X_train , X_test , y_train , y_test = train_test_split(df.Message , df.spam , test_size=0.2) #20% rows in test rest 80% in training

In [9]:
df.shape

(5572, 3)

In [10]:
X_train.shape

(4457,)

In [11]:
X_test.shape

(1115,)

In [12]:
type(X_test)

pandas.core.series.Series

In [14]:
X_train[:5] #Object Containing Index Of Message Row and The Message Body Itself

1881    Just seeing your missed call my dear brother. ...
1324    I thk 50 shd be ok he said plus minus 10.. Did...
4224                      Stupid auto correct on my phone
5508    Machan you go to gym tomorrow,  i wil come lat...
2658                          Dai  &lt;#&gt;  naal eruku.
Name: Message, dtype: object

In [30]:
X_train[:5][1324]

'I thk 50 shd be ok he said plus minus 10.. Did ü leave a line in between paragraphs?'

In [31]:
(X_train.values)

array(['Just seeing your missed call my dear brother. Do have a gr8 day.',
       'I thk 50 shd be ok he said plus minus 10.. Did ü leave a line in between paragraphs?',
       'Stupid auto correct on my phone', ...,
       'Pete,is this your phone still? Its Jenny from college and Leanne.what are you up to now?:)',
       'Que pases un buen tiempo or something like that',
       'All boys made fun of me today. Ok i have no problem. I just sent one message just for fun'],
      dtype=object)

In [19]:
#Using Count Vectorizer Model from SkLearn inorder to build our bag of words:
from sklearn.feature_extraction.text import CountVectorizer

cntVec = CountVectorizer()

#Generating Bag Of Words Model For the X_train Series:
X_train_cntVec = cntVec.fit_transform(X_train.values)

In [21]:
X_train_cntVec.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
X_train_cntVec.shape

(4457, 7701)

In [23]:
cntVec.get_feature_names_out()[1000:1050]

array(['anythin', 'anything', 'anythingtomorrow', 'anytime', 'anyway',
       'anyways', 'anywhere', 'apart', 'apartment', 'apeshit', 'aphex',
       'apo', 'apologetic', 'apologise', 'apology', 'app', 'apparently',
       'appeal', 'appear', 'appendix', 'applebees', 'apples',
       'application', 'apply', 'applyed', 'applying', 'appointment',
       'appointments', 'appreciate', 'approaches', 'approaching',
       'appropriate', 'approve', 'approved', 'approx', 'apps', 'appt',
       'appy', 'april', 'aproach', 'apt', 'aptitude', 'ar', 'arabian',
       'arcade', 'archive', 'ard', 'are', 'area', 'aren'], dtype=object)

In [25]:
#Finding the Email Body Words Position In Count Vectorizer Vocabulary
cntVec.vocabulary_

{'just': 3854,
 'seeing': 5939,
 'your': 7663,
 'missed': 4493,
 'call': 1619,
 'my': 4643,
 'dear': 2191,
 'brother': 1538,
 'do': 2370,
 'have': 3346,
 'gr8': 3197,
 'day': 2178,
 'thk': 6798,
 '50': 534,
 'shd': 6021,
 'be': 1280,
 'ok': 4881,
 'he': 3355,
 'said': 5830,
 'plus': 5231,
 'minus': 4480,
 '10': 260,
 'did': 2299,
 'leave': 4041,
 'line': 4106,
 'in': 3617,
 'between': 1345,
 'paragraphs': 5047,
 'stupid': 6500,
 'auto': 1151,
 'correct': 2019,
 'on': 4900,
 'phone': 5154,
 'machan': 4273,
 'you': 7657,
 'go': 3147,
 'to': 6876,
 'gym': 3270,
 'tomorrow': 6904,
 'wil': 7470,
 'come': 1918,
 'late': 4007,
 'goodnight': 3174,
 'dai': 2148,
 'lt': 4231,
 'gt': 3247,
 'naal': 4653,
 'eruku': 2626,
 'error': 2623,
 'argh': 1054,
 '3g': 462,
 'is': 3723,
 'spotty': 6360,
 'anyway': 1004,
 'the': 6761,
 'only': 4912,
 'thing': 6788,
 'remember': 5646,
 'from': 3005,
 'research': 5685,
 'we': 7371,
 'was': 7344,
 'that': 6758,
 'province': 5444,
 'and': 963,
 'sterling': 6426,


In [26]:
X_train_np = X_train_cntVec.toarray() #Converting to NumPy Array

In [28]:
np.where(X_train_np[0] != 0)

(array([1538, 1619, 2178, 2191, 2370, 3197, 3346, 3854, 4493, 4643, 5939,
        7663], dtype=int64),)

In [33]:
cntVec.get_feature_names_out()[1619]

'call'

In [34]:
#Using Naive Bayes Classifier Model For Email Spam Detection:
from sklearn.naive_bayes import MultinomialNB

nbModel = MultinomialNB()

nbModel.fit(X_train_cntVec , y_train)

In [35]:
X_test_cntVec = cntVec.transform(X_test)

In [40]:
#Evaluating Performance Of the Model:
#Whenever We have an imbalanced dataset (eg : this ds has disproportional spam and not-spam email bodies), we should use classification_report
#(containing f1 score)
from sklearn.metrics import classification_report

#Prediction Of Test Subset:
y_predicted = nbModel.predict(X_test_cntVec)

#Printing Classification Performance report:
print(classification_report(y_test , y_predicted)) # 0 -> Spam and 1 -> Not Spam

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       953
           1       0.97      0.95      0.96       162

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [45]:
#testing the prediction ability of model:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_cntVec = cntVec.transform(emails)
y_predicted = nbModel.predict(emails_cntVec)

for elem in y_predicted:
    if(elem):
        print("Spam")
    else:
        print("Not Spam")

Not Spam
Spam


In [46]:
#A very shorter and easier way of doing this entire thing using sklearn pipeline:
from sklearn.pipeline import Pipeline

classifier_model = Pipeline([
    ('vectoriser' , CountVectorizer()),
    ('nb' , MultinomialNB())
])

In [49]:
#not need to pass countVector as its automatically handled only fit data to train:
classifier_model.fit(X_train , y_train)

In [50]:
#testing model performance:
y_pred = classifier_model.predict(X_test)

print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       953
           1       0.97      0.95      0.96       162

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

