In [59]:
import plotly_express as px
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

## Import the data

In [2]:
# Forward slashes work on every platform and avoid the invalid "\s" escape
# that a backslash-separated path puts into a normal string literal.
mails = pd.read_csv("datasets/spam_ham_dataset.csv")

## Take a quick look at the dataset

In [3]:
# Preview the first five rows to see which columns the file provides.
mails.head(5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


I will not need the first two columns: `Unnamed: 0` does not give me any additional information, and the model will not operate on the string category in `label` (the numeric `label_num` column is used instead).

In [4]:
# Count how many messages carry each value of the string label.
mails['label'].value_counts()

ham     3672
spam    1499
Name: label, dtype: int64

I split the data into features and labels:

In [5]:
# Features: the raw message text. Labels: the numeric label column.
x = mails["text"].values
y = mails["label_num"].values

In [6]:
# Character count of the first raw message, as a baseline before cleaning.
len(x[0])

327

## Data cleaning

I will clean the data a little first: I will lowercase everything so that words differing only in capitalisation are treated the same way, and I will remove special characters and numbers. I should also get rid of the "Subject:" at the beginning of each message. I do not treat it as a stopword (stopwords are removed as well); instead I strip it only as the prefix of each message, because I do not want to remove it from the body of a mail if it happens to occur there.

In [7]:
# English stop-word list from NLTK; clean() drops every token found in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# available locally (e.g. via nltk.download) — confirm.
StopWords = stopwords.words("english")

In [8]:
def clean(text, stop_words=None):
    """Normalise one raw e-mail into lowercase, letters-only text.

    Steps, in order: drop a leading "Subject: " header if present, lowercase,
    remove stop words, replace every character that is not an ASCII letter or
    a space with a space, and collapse runs of spaces into a single space.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : iterable of str, optional
        Words to discard after lowercasing. Defaults to the module-level
        ``StopWords`` list.

    Returns
    -------
    str
        The cleaned text. It can start or end with a single space when the
        message began or ended with non-letter characters.
    """
    prefix = 'subject: '
    # Remove the header only when it is really there; slicing unconditionally
    # would eat the first nine characters of a message that lacks it.
    if text[:len(prefix)].lower() == prefix:
        text = text[len(prefix):]
    text = text.lower()
    # A set gives constant-time membership tests instead of a list scan per word.
    ignored = frozenset(StopWords if stop_words is None else stop_words)
    text = ' '.join(word for word in text.split() if word not in ignored)
    # Each non-letter, non-space character becomes a space ...
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    # ... and the resulting runs of spaces are squeezed to one.
    text = re.sub(' +', ' ', text)
    return text

In [9]:
# Build from the raw column rather than from x itself, so re-running this cell
# never feeds already-cleaned text back through clean().
x = [clean(text) for text in mails["text"].values]

I check the length of the first mail now:

In [10]:
# Character count of the first message after cleaning.
len(x[0])

210

## Get the words

Here I will find how many different words are in the dataset and check their frequency.

In [11]:
# Tally how often each word occurs across all messages.
counts = Counter()
for sentence in x:
    # split() with no arguments yields tokens that contain no whitespace,
    # so the tokens can be counted directly without further stripping.
    counts.update(sentence.split())

In [12]:
# (word, count) pairs ordered from the most to the least frequent word.
sorted_counts = counts.most_common()

In [13]:
# One entry per distinct word, so the length is the vocabulary size.
num_words = len(sorted_counts)
num_words

45630

That's a lot of unique words! I will now check the frequencies of their occurrences in the mails.

In [15]:
# Histogram of per-word occurrence counts; the x-axis is limited to 1-150.
fig = px.histogram(x=counts.values(), range_x=[1, 150])
fig.update_layout(
    title="Count of words distribution",
    xaxis_title="Number of occurences",
    yaxis_title="Number of words",
)
fig.show()

Most of the words are used fewer than 10 times.

## Split into train and test data

In [69]:
# Texts and labels must be cut at the same position to stay aligned.
assert len(x) == len(y), "features and labels differ in length"
# Sequential (unshuffled) split: first 80 % for training, the rest for testing.
split_at = int(len(x) * .8)
x_train, x_test = x[:split_at], x[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

## Encoding - bag of words

In [74]:
# Learn the vocabulary from the training texts only, then encode the test
# texts with that same fitted vectorizer.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

## Model training

In [75]:
from sklearn.neural_network import MLPClassifier

In [76]:
# MLP training is stochastic; fixing the seed makes the fitted model, and
# therefore the reported score, reproducible on a fresh run.
model = MLPClassifier(random_state=42).fit(X_train, y_train)

In [77]:
# Evaluate the fitted model on the held-out test split.
model.score(X_test, y_test)

0.9864734299516909