# Hum or Spam email detection system using Naive Bayes

### Prepared and coded by:
#### **** Amanuel Mihiret (Aman)

### 00. Install dependencies

In [58]:
#Install required dependencies
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install joblib



### 0. Import the installed dependencies

In [61]:
# Import the installed required libraries
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


### 1. Load and visualize the dataset

In [42]:
# Read the dataset using pandas 
dataset_df = pd.read_csv('dataset/spam_or_not_spam.csv')

In [43]:
dataset_df = dataset_df.dropna()

In [44]:
# Visualize the df
dataset_df.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0.0
1,martin a posted tassos papadopoulos the greek ...,0.0
2,man threatens explosion in moscow thursday aug...,0.0
3,klez the virus that won t die already the most...,0.0
4,in adding cream to spaghetti carbonara which ...,0.0


In [45]:
# Visualize the df
dataset_df.tail()

Unnamed: 0,email,label
3027,abc s good morning america ranks it the NUMBE...,1.0
3028,hyperlink hyperlink hyperlink let mortgage le...,1.0
3029,thank you for shopping with us gifts for all ...,1.0
3030,the famous ebay marketing e course learn to s...,1.0
3031,hello this is chinese traditional 子 件 NUMBER世...,1.0


In [46]:
# Visualize the df for both labels
dataset_df.groupby('label').describe()

Unnamed: 0_level_0,email,email,email,email
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0.0,2500,2445,url URL date not supplied URL,10
1.0,499,427,lowest rates available for term life insurance...,5


### 2. Split and clean the dataset/df

In [47]:
# Split the df as training and testing
x_train, x_test, y_train, y_test = train_test_split(dataset_df.email, dataset_df.label, test_size = 0.25)

In [48]:
print(x_train.shape)

(2249,)


In [49]:
x_train

1134    once upon a time axel wrote has anyone an answ...
1145     from scott lipcon slipcon mercea net date mon...
2612     guaranteed to increase lift and firm your bre...
2493    on mon nov NUMBER NUMBER at NUMBER NUMBER NUMB...
363     we met a family in our parent baby group with ...
                              ...                        
2699     do you want to teach and grow rich if you are...
388      from robert elz kre munnari oz au date thu NU...
2927     hyperlink URL has teamed up with hyperlink fo...
2981    new account for zzzz URL adult club offers fre...
940     interesting it s possible of course that the s...
Name: email, Length: 2249, dtype: object

In [50]:
# Check for NaN values
nan_indices = y_train.index[y_train.isna()]

In [51]:
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

In [52]:
x_train_count

<2249x25724 sparse matrix of type '<class 'numpy.int64'>'
	with 250282 stored elements in Compressed Sparse Row format>

In [53]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### 3. Define and train the NB model

In [54]:
# Define the model
model = MultinomialNB()

In [55]:
# Train the model
model.fit(x_train_count, y_train) 

### 4. Save the model and the Vectorizer

In [62]:
# Save the model to a file
joblib.dump(model, 'spam_classifier_model.joblib')

['spam_classifier_model.joblib']

In [64]:
# Assuming vectorizer is your trained CountVectorizer
joblib.dump(cv, 'vectorizer.pkl')

['vectorizer.pkl']

##                                                                 **** End ****