In [4]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [7]:
data = pd.read_csv('Spam Email raw text for NLP.csv')

In [8]:
data

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6
...,...,...,...
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CATEGORY   5796 non-null   int64 
 1   MESSAGE    5796 non-null   object
 2   FILE_NAME  5796 non-null   object
dtypes: int64(1), object(2)
memory usage: 136.0+ KB


# Preprocessing

In [10]:
#Drop the FILE_NAME column
data= data.drop('FILE_NAME',axis=1)

In [13]:
#inspect data
data.groupby('CATEGORY').describe()
# 0 -->not spam email
# 1 -->spam email

Unnamed: 0_level_0,MESSAGE,MESSAGE,MESSAGE,MESSAGE
Unnamed: 0_level_1,count,unique,top,freq
CATEGORY,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,3900,3862,URL: http://www.askbjoernhansen.com/archives/2...,4
1,1896,1763,\n\nHello I am your hot lil horny toy.\n\n ...,7


In [22]:
#splitting the dataset

X_train,X_test,y_train,y_test = train_test_split(data.MESSAGE,data.CATEGORY,train_size=0.8)

Countvectorizer is a method to convert text to numerical data.
By default, Countvectorizer converts the text to lowercase and uses word-level tokenization.

In [24]:
#find word count and store data as a matrix
cv= CountVectorizer()
x_train_count = cv.fit_transform(X_train.values)

In [56]:
x_train_count

<4636x82759 sparse matrix of type '<class 'numpy.int64'>'
	with 727655 stored elements in Compressed Sparse Row format>

In [59]:
X_count = x_train_count.toarray()
df= pd.DataFrame(data=X_count,columns =cv.get_feature_names_out())
print(df)


      00  000  0000  00000  000000  00000000  000000000  0000000000  \
0      0    0     0      0       0         0          0           0   
1      0    0     0      0       0         0          0           0   
2      0    0     0      0       0         0          0           0   
3      1    0     0      0       0         0          0           0   
4      0    0     0      0       0         0          0           0   
...   ..  ...   ...    ...     ...       ...        ...         ...   
4631   0    0     0      0       0         0          0           0   
4632   0    0     0      0       0         0          0           0   
4633   1    0     0      0       0         0          0           0   
4634   0    0     0      0       0         0          0           0   
4635   0    0     0      0       1         0          0           0   

      00000000000000000  000000000000000000000  ...  þë  þîñ  þö  þööööà  þüg  \
0                     0                      0  ...   0    0   0  

In [62]:
cv.vocabulary_ 
#They are the collection of words in the sparse matrix.
# The numbers do not represent the count of the words but the position of the words in the matrix

{'newsletter': 52832,
 'coach': 23588,
 'invest': 42916,
 'septembre': 66003,
 '2002': 2721,
 'accompagnateur': 13223,
 'des': 27184,
 'jeunes': 44101,
 'entreprises': 31589,
 'www': 78447,
 'coachinvest': 23592,
 'com': 23829,
 'ou': 55437,
 '3615': 4870,
 '34': 4758,
 'euros': 32158,
 'min': 50420,
 'nouvelles': 53517,
 'de': 26601,
 'lancement': 46513,
 'réussi': 64748,
 'pour': 58971,
 'interactive': 42677,
 'qui': 61253,
 'décroche': 29967,
 'ses': 66109,
 'premiers': 59245,
 'contrats': 24678,
 'accompagne': 13224,
 'les': 47032,
 'dans': 26265,
 'la': 46408,
 'création': 25447,
 'le': 46851,
 'développement': 29975,
 'optimisation': 55073,
 'et': 32034,
 'gestion': 36276,
 'au': 16781,
 'quotidien': 61317,
 'leur': 47083,
 'site': 66902,
 'internet': 42752,
 'propose': 59766,
 'clients': 23287,
 'nombreux': 53328,
 'services': 66101,
 'afin': 14047,
 'aider': 14461,
 'concevoir': 24215,
 'mettre': 50077,
 'en': 31282,
 'place': 58306,
 'stratégie': 68860,
 'contact': 24572,
 'hu

In [66]:
cv.get_feature_names_out()

array(['00', '000', '0000', ..., 'ÿÿã',
       'ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿò',
       'ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿó'], dtype=object)

In [26]:
#train model
model = MultinomialNB()
model.fit(x_train_count,y_train)

In [28]:
#pre test of not spam mail
email_ham = ['hey lets meet today!']
email_count = cv.transform(email_ham)
model.predict(email_count)

array([0], dtype=int64)

In [29]:
#pre test of spam mail
email_spam = ['click for reward to get money']
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([1], dtype=int64)

In [30]:
#testing model
X_test_count = cv.transform(X_test)
model.score(X_test_count,y_test)

0.9431034482758621