In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the e-mail dataset; the path is relative to the kernel's working directory.
dataset = pd.read_csv('spam_ham_dataset.csv')

In [3]:
# Preview the first rows of the freshly loaded frame to see which columns it has.
dataset.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
# Drop unwanted columns: 'label_num' plus every column whose name contains
# 'Unnamed'. The column labels are passed explicitly via `columns=` (rather
# than handing `drop` a DataFrame slice), and errors='ignore' keeps the cell
# re-runnable once the columns are already gone.
unnamed_cols = dataset.columns.str.contains('Unnamed')  # boolean mask over the column index
dataset = dataset.drop(
    columns=['label_num', *dataset.columns[unnamed_cols]],
    errors='ignore',
)

In [5]:
# Confirm which columns are left after the drop.
dataset.head()

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [6]:
# (rows, columns) of the frame at this point in the notebook.
dataset.shape

(5171, 2)

In [7]:
# Per-column check for missing values: True would mean the column has at least one NaN.
dataset.isna().any()

label    False
text     False
dtype: bool

In [8]:
# Remove duplicate rows (rows whose values match an earlier row in every
# column); the first occurrence of each is kept. Reassigning instead of
# using inplace=True keeps the statement chainable and explicit.
dataset = dataset.drop_duplicates()

In [9]:
# (rows, columns) again, to see how many rows the de-duplication removed.
dataset.shape

(4993, 2)

In [10]:
# Convert "ham" to 0 and "spam" to 1.
# Series.map() returns NaN for any value that is not a key of the dict, so
# mapping a column that is already 0/1 would wipe every label. Only encode
# while the column is still non-numeric, which makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(dataset['label']):
    dataset['label'] = dataset['label'].map({'ham': 0, 'spam': 1})
dataset.head()

Unnamed: 0,label,text
0,0,Subject: enron methanol ; meter # : 988291\r\n...
1,0,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,0,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,1,"Subject: photoshop , windows , office . cheap ..."
4,0,Subject: re : indian springs\r\nthis deal is t...


In [11]:
import nltk
from nltk.corpus import stopwords
import string

# Download the NLTK stopwords package (used by stopwords.words('english') in process_text).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
#remove stopwords and punctuations
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, built on first use only."""
    return frozenset(stopwords.words('english'))


def process_text(text):
    """Strip punctuation and English stop words from ``text``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The whitespace-separated tokens of ``text`` (original casing kept)
        after every ``string.punctuation`` character has been removed and
        every token whose lowercase form is an NLTK English stop word has
        been dropped.
    """
    # Delete punctuation character by character, then re-assemble the string.
    nopunc = ''.join(char for char in text if char not in string.punctuation)

    # Fetch the stop words once, as a set: calling stopwords.words('english')
    # inside the comprehension would rebuild the list for every token and
    # search it linearly.
    stop_words = _english_stopwords()
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [13]:
# Try process_text on the first few e-mails; each text becomes a list of tokens.
dataset['text'].head().apply(process_text)

0    [Subject, enron, methanol, meter, 988291, foll...
1    [Subject, hpl, nom, january, 9, 2001, see, att...
2    [Subject, neon, retreat, ho, ho, ho, around, w...
3    [Subject, photoshop, windows, office, cheap, m...
4    [Subject, indian, springs, deal, book, teco, p...
Name: text, dtype: object

In [15]:
# Import train_test_split from sklearn.model_selection; this import path avoids the deprecation warning.
from sklearn.model_selection import train_test_split

# Split texts and labels into train and test partitions. No test_size is
# given, so train_test_split's default proportion applies; random_state=1
# makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['text'],
    dataset['label'],
    random_state=1,
)

# Report how many rows ended up in each partition.
print('Number of rows in the total set: {}'.format(len(dataset)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))

Number of rows in the total set: 4993
Number of rows in the training set: 3744
Number of rows in the test set: 1249


In [16]:
# Convert text to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser with default settings.
# NOTE(review): process_text defined earlier in this notebook is not passed
# to the vectoriser (e.g. via `analyzer=`), so it plays no part in the
# features built here. Confirm whether that is intended.
count_vector = CountVectorizer()

# Learn the vocabulary from the training texts and encode them in one step.
training_data = count_vector.fit_transform(X_train)

# Encode the test texts with the vocabulary learned above. The vectoriser is
# deliberately not re-fitted on the test data.
testing_data = count_vector.transform(X_test)




In [17]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default settings, trained on the
# vectorised training texts and their 0/1 labels.
naive_bayes = MultinomialNB()
naive_bayes.fit(X=training_data, y=y_train)

In [18]:
# Predict a label for every row of the vectorised test set.
predictions = naive_bayes.predict(testing_data)

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Score the test-set predictions against the true labels. Each metric is
# called as metric(y_true, y_pred) and printed under its caption.
metric_reports = (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
)
for caption, metric in metric_reports:
    print(caption, format(metric(y_test, predictions)))

Accuracy score:  0.9791833466773419
Precision score:  0.9656160458452722
Recall score:  0.9601139601139601
F1 score:  0.9628571428571429
