In [2]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import plot_tree, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

import re
import nltk
import string
nltk.download('wordnet')
from nltk.stem import SnowballStemmer
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import os
from urllib.request import urlretrieve
import tarfile
import shutil
import glob

DATASETS_DIR = 'datasets'
MODELS_DIR = 'models'
TAR_DIR = os.path.join(DATASETS_DIR, 'tar')

SPAM_1_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2'
SPAM_2_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2'
EASY_HAM_1_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2'
EASY_HAM_2_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2'
HARD_HAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2'

def download_dataset(url):
    """download and unzip data from a url into the specified path"""

    if not os.path.isdir(TAR_DIR):
        os.makedirs(TAR_DIR)

    filename = url.rsplit('/', 1)[-1]
    tarpath = os.path.join(TAR_DIR, filename)

    try:
        tarfile.open(tarpath)
    except:
        urlretrieve(url, tarpath)

    with tarfile.open(tarpath) as tar:
        dirname = os.path.join(DATASETS_DIR, tar.getnames()[0])
        if os.path.isdir(dirname):
            shutil.rmtree(dirname)
        tar.extractall(path=DATASETS_DIR)

        cmds_path = os.path.join(dirname, 'cmds')
        if os.path.isfile(cmds_path):
            os.remove(cmds_path)

    return dirname

def load_dataset(dirpath):
    """load emails from the specified directory"""

    files = []
    filepaths = glob.glob(dirpath + '/*')
    for path in filepaths:
        with open(path, 'rb') as f:
            byte_content = f.read()
            str_content = byte_content.decode('utf-8', errors='ignore')
            files.append(str_content)
    return files

spam_1_dir = download_dataset(SPAM_1_URL)
spam_2_dir = download_dataset(SPAM_2_URL)
easy_ham_1_dir = download_dataset(EASY_HAM_1_URL)
easy_ham_2_dir = download_dataset(EASY_HAM_2_URL)
hard_ham_dir = download_dataset(HARD_HAM_URL)

spam_1 = load_dataset(spam_1_dir)
spam_2 = load_dataset(spam_2_dir)
easy_ham_1 = load_dataset(easy_ham_1_dir)
easy_ham_2 = load_dataset(easy_ham_2_dir)
hard_ham = load_dataset(hard_ham_dir)

In [4]:
print(easy_ham_1[0])

From ilug-admin@linux.ie  Mon Sep  2 13:12:53 2002
Return-Path: <ilug-admin@linux.ie>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 0046D47C84
	for <zzzz@localhost>; Mon,  2 Sep 2002 07:44:57 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Mon, 02 Sep 2002 12:44:57 +0100 (IST)
Received: from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g82B72Z30034 for
    <zzzz-ilug@spamassassin.taint.org>; Mon, 2 Sep 2002 12:07:02 +0100
Received: from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org
    (8.9.3/8.9.3) with ESMTP id MAA00554; Mon, 2 Sep 2002 12:05:51 +0100
Received: from dspsrv.com (vir.dspsrv.com [193.120.211.34]) by
    lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id MAA00506 for <ilug@linux.ie>;
    Mon, 2 Sep 2002 12:05:31 +0100
    [193.120

In [5]:
X = easy_ham_1 + easy_ham_2 + hard_ham + spam_1 + spam_2
len(X)

6047

In [6]:
y = np.concatenate((np.zeros(len(easy_ham_1 + easy_ham_2 + hard_ham)), np.ones(len(spam_1 + spam_2))))
y.shape

(6047,)

In [7]:
def remove_headers(messages):
    header_pattern = re.compile(r'^From .*?\n\n', re.MULTILINE | re.DOTALL)

    cleaned_messages = []
    for message in messages:
        cleaned_message = re.sub(header_pattern, '', message)
        cleaned_messages.append(cleaned_message)

    return cleaned_messages

In [8]:
def replace_urls_with_word(messages):
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    replaced_messages = []
    for message in messages:
        replaced_message = re.sub(url_pattern, "URL", message)
        replaced_messages.append(replaced_message)

    return replaced_messages

In [9]:
def convert_to_lowercase(messages):
    lowercase_messages = []

    for message in messages:
        lowercase_message = message.lower()
        lowercase_messages.append(lowercase_message)

    return lowercase_messages

In [10]:
def replace_numbers_with_word(messages, replacement="NUM"):
    number_pattern = re.compile(r'\d+')

    replaced_messages = []
    for message in messages:
        replaced_message = re.sub(number_pattern, replacement, message)
        replaced_messages.append(replaced_message)

    return replaced_messages

In [11]:
from nltk.stem import WordNetLemmatizer

def remove_word_endings(messages):
    lemmatizer = WordNetLemmatizer()
    processed_messages = []

    for message in messages:
        words = nltk.word_tokenize(message)
        processed_words = [lemmatizer.lemmatize(word) for word in words]
        processed_message = ' '.join(processed_words)
        processed_messages.append(processed_message)

    return processed_messages

In [12]:
import string

def remove_punctuation(messages):
    translator = str.maketrans("", "", string.punctuation)
    processed_messages = []

    for message in messages:
        processed_message = message.translate(translator)
        processed_messages.append(processed_message)

    return processed_messages

In [13]:
new_X = remove_punctuation(
    convert_to_lowercase(
        remove_word_endings(
            replace_numbers_with_word(
                replace_urls_with_word(
                    remove_headers(X))))))

In [14]:
print(new_X[0])

  begin pgp signed message    vincent cunniffe wrote   justin maccarthy wrote     i think i ll ask this question again  a i sent on friday afternoon        mailman      trust me  you do  not  want to running your own mailing list  on your own software    you ll wind up cry in a dark room looking for something  highvoltage to stick your finger into  all thing considered  i get that effect with mailman  but the viable alternative are ezmlm  which is loonware and i m avoiding on principle  and majordomo  which seems to have stagnated  oh  and there s apparently something called smartlist which is a bitch to set up  waider    waider  waiderie  yes  it is very personal of me   begin pgp signature    version  gnupg vnumnumnum  gnulinux  comment  using gnupg with mozilla  url iqevawubpxnenumhbxyzzsabnumaqhnumkqfxqeomahnxalzbmgdnumiydnumvqaagwarnumdq kdpznumnbecrnumosnumpjolynumlspkgnshpjcdzirsxjvxmfpnumyrnqnumaypnumhggwrvwgjgb numnnumhgrgeznumsnumrhunumrauhpfbnumxonumxzminumgskdhsgefqsuoaznum

In [15]:
vectorizer = CountVectorizer()
data_count_matrix = vectorizer.fit_transform(new_X)
data_feature_matrix = data_count_matrix.toarray()
X_vectorized = pd.DataFrame(data=data_feature_matrix,columns = vectorizer.get_feature_names_out())

In [16]:
X_vectorized

Unnamed: 0,aa,aaa,aaaaaa,aaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaqaaaaia,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacqaaayqb,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaghoawnumajwa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaagwakgbjacoa,...,隊x,隊xun,隊xunum,뷽ݵi,죺ʹҵʾչ,쵥λsponsors,ﵽvawry,ﵽvīgρaggyipﵽġc,𬣡b,𬣤jӡa
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6042,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6043,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6044,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6045,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

In [18]:
X_train.shape

(4837, 90994)

In [19]:
y_train.shape

(4837,)

In [None]:
svc = SVC()
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [None]:
print("Precision: ", precision)
print("Recall: ", recall)

Precision:  0.9565217391304348
Recall:  0.4230769230769231


In [30]:
sgd = SGDClassifier(alpha=1e-7)
sgd.fit(X_train,y_train)
y_pred = sgd.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [31]:
print("Precision: ", precision)
print("Recall: ", recall)

Precision:  0.9597855227882037
Recall:  0.9445910290237467


In [32]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [33]:
print("Precision: ", precision)
print("Recall: ", recall)

Precision:  0.9261213720316622
Recall:  0.9261213720316622


In [24]:
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
y_pred = mnb.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [25]:
print("Precision: ", precision)
print("Recall: ", recall)

Precision:  0.9777158774373259
Recall:  0.9261213720316622


In [28]:
rfc = RandomForestClassifier(n_estimators = 100, random_state = 42)
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [29]:
print("Precision: ", precision)
print("Recall: ", recall)

Precision:  0.9755434782608695
Recall:  0.9472295514511874
