### Fetch Data

In [1]:
from pathlib import Path

IMAGES_PATH = Path() / "images" / "classification"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

In [2]:
import tarfile
import urllib


def fetch_spam_data():
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / "datasets" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (
        ("easy_ham", "ham", ham_url),
        ("spam", "spam", spam_url),
    ):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            tar_bz2_file = tarfile.open(path)
            tar_bz2_file.extractall(path=spam_path)
            tar_bz2_file.close()
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from urlextract import URLExtract

In [4]:
ham_dir, spam_dir = fetch_spam_data()

In [5]:
ham_filenames = [f for f in sorted(ham_dir.iterdir()) if len(f.name) > 20]
spam_filenames = [f for f in sorted(spam_dir.iterdir()) if len(f.name) > 20]

In [6]:
for f in sorted(ham_dir.iterdir())[:5]:
    print(f.name)

00001.7c53336b37003a9286aba55d2945844c
00002.9c4069e25e1ef370c078db7ee85ff9ac
00003.860e3c3cee1b42ead714c5c874fe25f7
00004.864220c5b6930b209cc287c361c99af1
00005.bf27cdeaf0b8c4647ecd61b1d09da613


In [7]:
len(ham_filenames)

2500

In [8]:
len(spam_filenames)

500

We have move of the Negative instance 0 (ham a.k.a not-spam), than the positive instance 1 (spam)

In [9]:
# parse email text files
from email import policy
from email.parser import BytesParser

def load_email_file(f):
    with open(f, "rb") as fb:
        return BytesParser(policy=policy.default).parse(fb)
    
ham_emails = [load_email_file(f) for f in ham_filenames]
spam_emails = [load_email_file(f) for f in spam_filenames]

In [10]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [11]:
print(spam_emails[1].get_content().strip())

1) Fight The Risk of Cancer!
http://www.adclick.ws/p.cfm?o=315&s=pk007

2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days
http://www.adclick.ws/p.cfm?o=249&s=pk007

3) Get the Child Support You Deserve - Free Legal Advice
http://www.adclick.ws/p.cfm?o=245&s=pk002

4) Join the Web's Fastest Growing Singles Community
http://www.adclick.ws/p.cfm?o=259&s=pk007

5) Start Your Private Photo Album Online!
http://www.adclick.ws/p.cfm?o=283&s=pk007

Have a Wonderful Day,
Offer Manager
PrizeMama













If you wish to leave this list please use the link below.
http://www.qves.com/trim/?ilug@linux.ie%7C17%7C114258


-- 
Irish Linux Users' Group: ilug@linux.ie
http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.
List maintainer: listmaster@linux.ie


In [12]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        multipart = " ,".join(
            [get_email_structure(part) for part in payload]
        )
        return f"multipart({multipart})"
    else:
        return email.get_content_type()
    


print(get_email_structure(spam_emails[383]))

multipart(text/plain ,text/html)


In [13]:
from collections import Counter


def email_structure_counter(emails):
    structure_list = [get_email_structure(email) for email in emails]
    return Counter(structure_list)

email_structure_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain ,application/pgp-signature)', 66),
 ('multipart(text/plain ,text/html)', 8),
 ('multipart(text/plain ,text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain ,application/octet-stream)', 2),
 ('multipart(text/plain ,text/enriched)', 1),
 ('multipart(text/plain ,application/ms-tnef ,text/plain)', 1),
 ('multipart(multipart(text/plain ,text/plain ,text/plain) ,application/pgp-signature)',
  1),
 ('multipart(text/plain ,video/mng)', 1),
 ('multipart(text/plain ,multipart(text/plain))', 1),
 ('multipart(text/plain ,application/x-pkcs7-signature)', 1),
 ('multipart(text/plain ,multipart(text/plain ,text/plain) ,text/rfc822-headers)',
  1),
 ('multipart(text/plain ,multipart(text/plain ,text/plain) ,multipart(multipart(text/plain ,application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain ,application/x-java-applet)', 1)]

In [14]:
email_structure_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain ,text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain ,image/jpeg)', 3),
 ('multipart(text/html ,application/octet-stream)', 2),
 ('multipart(text/plain ,application/octet-stream)', 1),
 ('multipart(text/html ,text/plain)', 1),
 ('multipart(multipart(text/html) ,application/octet-stream ,image/jpeg)', 1),
 ('multipart(multipart(text/plain ,text/html) ,image/gif)', 1),
 ('multipart/alternative', 1)]

We can deduce that the hams are mostly in `text/plain` format

In [15]:
for header, value in spam_emails[200].items():
    print(header, ":", value)

Return-Path : <adinebook@netscape.net>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 897B544155	for <zzzz@localhost>; Mon,  2 Sep 2002 11:26:40 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Mon, 02 Sep 2002 16:26:40 +0100 (IST)
Received : from solpdc1.saudionline.com.sa ([213.238.13.107]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g81KKsZ02982 for    <webmaster@efi.ie>; Sun, 1 Sep 2002 21:20:54 +0100
Received : from tsvm13.saudionline.com.sa (TSVM13 [10.1.3.5]) by    solpdc1.saudionline.com.sa with SMTP (Microsoft Exchange Internet Mail    Service Version 5.5.2653.13) id SABYLXXN; Sun, 1 Sep 2002 23:21:56 +0300
Received : from smcjednt.saudi-maritime.com ([213.238.2.116]) by    tsvm13.saudionline.com.sa  with Microsoft SMTPSVC(5.5.1875.185.18);    Sun, 1 Sep 2002 23:32:42 +0300
Rec

In [16]:
for header, value in ham_emails[0].items():
    print(header, ":", value)

Return-Path : <exmh-workers-admin@spamassassin.taint.org>
Delivered-To : zzzz@localhost.netnoteinc.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36	for <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)
Received : from listman.spamassassin.taint.org (listman.spamassassin.taint.org [66.187.233.211]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MBYrZ04811 for    <zzzz-exmh@spamassassin.taint.org>; Thu, 22 Aug 2002 12:34:53 +0100
Received : from listman.spamassassin.taint.org (localhost.localdomain [127.0.0.1]) by    listman.redhat.com (Postfix) with ESMTP id 8386540858; Thu, 22 Aug 2002    07:35:02 -0400 (EDT)
Delivered-To : exmh-workers@listman.spamassassin.taint.org
Received : from int-mx1.corp.spamassassin.taint.org (int-mx1.corp.spamassassin.taint.org 

In [17]:
X = np.array(ham_emails + spam_emails, dtype=object)
X.shape
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))
y.shape


(3000,)

In [18]:
df = zip(X, y)
pd.DataFrame(df, columns=["email", "label"])

Unnamed: 0,email,label
0,"[Return-Path, Delivered-To, Received, Received...",0
1,"[Return-Path, Delivered-To, Received, Received...",0
2,"[Return-Path, Delivered-To, Received, Received...",0
3,"[Return-Path, Delivered-To, Received, Received...",0
4,"[Return-Path, Delivered-To, Received, Received...",0
...,...,...
2995,"[Return-Path, Delivered-To, Received, Received...",1
2996,"[Return-Path, Delivered-To, Received, Received...",1
2997,"[Return-Path, Delivered-To, Received, Received...",1
2998,"[Return-Path, Delivered-To, Received, Received...",1


In [19]:
def html_to_plain_text(html):
    """Convert HTML to clean plain text"""
    soup = BeautifulSoup(html, 'lxml')
    
    # Get text with separator between elements
    text = soup.get_text(separator=' ', strip=True)
    
    return text

def urlExtractor(text):
    extractor = URLExtract()
    return extractor.find_urls(text)

In [20]:
html = "<p>Hello, visit <a href='https://example.com'>Example</a></p>"
soup = BeautifulSoup(html, "lxml")
print(soup.text)

text = "Here are some links: https://google.com and https://openai.com"
extractor = URLExtract()
print(extractor.find_urls(text))


Hello, visit Example
['https://google.com', 'https://openai.com']


In [21]:
print(html_to_plain_text(ham_emails[10].get_content()))

Hello, have you seen and discussed this article and his approach?

Thank you

http://www.paulgraham.com/spam.html
-- "Hell, there are no rules here-- we're trying to accomplish something."
-- Thomas Alva Edison




-------------------------------------------------------
This sf.net email is sponsored by: OSDN - Tired of that same old
cell phone?  Get a new here for FREE!
https://www.inphonic.com/r.asp?r=sourceforge1&refcode1=vs3390
_______________________________________________
Spamassassin-devel mailing list
Spamassassin-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/spamassassin-devel


In [22]:
urlExtractor(ham_emails[10].get_content())

['http://www.paulgraham.com/spam.html',
 'sf.net',
 'https://www.inphonic.com/r.asp?r=sourceforge1&refcode1=vs3390',
 'https://lists.sourceforge.net/lists/listinfo/spamassassin-devel']

In [25]:
from nltk import PorterStemmer, SnowballStemmer

stemmer = PorterStemmer()
for word in ("Computations", "Computation", "Computing", "Computed", "Compute",
             "Compulsive"):
    print(word, "=>", stemmer.stem(word))



Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [None]:
def preprocess_email_text(text):
    """Clean and stem email text"""
    stemmer = SnowballStemmer("english")
    # Convert to lowercase
    text = text.lower()
    # Split into words
    words = text.split()
    # Stem each word
    stemmed_words = [stemmer.stem(word) for word in words]
    # Join back
    return " ".join(stemmed_words)


# Usage
email_text = "Congratulations! You are the winner of our amazing prize!"
processed = preprocess_email_text(email_text)
print(processed)

congratulations! you are the winner of our amaz prize!
