In [59]:
import urllib
import tarfile
import email
import email.policy
import re
import nltk
import urlextract

import numpy as np

from pathlib import Path
from collections import Counter
from pprint import pprint
from html import unescape

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from scipy.sparse import csr_matrix

# Getting the data

In [3]:

def fetch_spam_data():
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / "datasets" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)

    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url), ("spam", "spam", spam_url)):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            tar_bz2_file = tarfile.open(path)
            tar_bz2_file.extractall(path=spam_path)
            tar_bz2_file.close()

    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

ham_dir, spam_dir = fetch_spam_data()

Downloading datasets/spam/ham.tar.bz2
Downloading datasets/spam/spam.tar.bz2


In [4]:
# Load email file names
ham_filenames = [f for f in sorted(ham_dir.iterdir()) if len(f.name) > 20]
spam_filenames = [f for f in sorted(spam_dir.iterdir()) if len(f.name) > 20]

# Data Exploration

In [6]:
print ("Ham count: ", len(ham_filenames))
print ("Spam count: ", len(spam_filenames))

Ham count:  2500
Spam count:  500


In [10]:
def load_email(filepath):
  with open(filepath, "rb") as f:
    return email.parser.BytesParser(policy=email.policy.default).parse(f)
  
ham_emails = [load_email(filepath) for filepath in ham_filenames]
spam_emails = [load_email(filepath) for filepath in spam_filenames]

# double checking that the count is the same as above
print ("Ham count: ", len(ham_emails))
print ("Spam count: ", len(spam_emails))

Ham count:  2500
Spam count:  500


In [12]:
print (ham_emails[5].get_content().strip())

> I just had to jump in here as Carbonara is one of my favourites to make and 
> ask 
> what the hell are you supposed to use instead of cream? 

Isn't it just basically a mixture of beaten egg and bacon (or pancetta, 
really)? You mix in the raw egg to the cooked pasta and the heat of the pasta 
cooks the egg. That's my understanding.

Martin

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [13]:
print (spam_emails[5].get_content().strip())

A POWERHOUSE GIFTING PROGRAM You Don't Want To Miss! 
 
  GET IN WITH THE FOUNDERS! 
The MAJOR PLAYERS are on This ONE
For ONCE be where the PlayerS are
This is YOUR Private Invitation

EXPERTS ARE CALLING THIS THE FASTEST WAY 
TO HUGE CASH FLOW EVER CONCEIVED
Leverage $1,000 into $50,000 Over and Over Again

THE QUESTION HERE IS:
YOU EITHER WANT TO BE WEALTHY 
OR YOU DON'T!!!
WHICH ONE ARE YOU?
I am tossing you a financial lifeline and for your sake I 
Hope you GRAB onto it and hold on tight For the Ride of youR life!

Testimonials

Hear what average people are doing their first few days:
�We've received 8,000 in 1 day and we are doing that over and over again!' Q.S. in AL
 �I'm a single mother in FL and I've received 12,000 in the last 4 days.� D. S. in FL
�I was not sure about this when I sent off my $1,000 pledge, but I got back $2,000 the very next day!� L.L. in KY
�I didn't have the money, so I found myself a partner to work this with. We have received $4,000 over the last 2 days

In [23]:
# Exploring email structures

def get_email_structure(email):
  # text/plain email type
  if isinstance(email, str):
    return email
  
  payload = email.get_payload()
  # multipart email type (with types embedded)
  if isinstance(payload, list):
    # attachments can have emails inside of them
    multipart = ", ".join([get_email_structure(sub_email) for sub_email in payload])
    
    return f"multipart({multipart})"

  # other email type
  else:
    return email.get_content_type()
  
def structures_counter(emails):
  structures = Counter()
  for email in emails:
    structure = get_email_structure(email)
    structures[structure] += 1
  
  return structures

print ("Ham emails most common structures:")
pprint (structures_counter(ham_emails).most_common()[:5])

print ("-"*60)

print ("Spam emails most common structures:")
pprint (structures_counter(spam_emails).most_common()[:5])


Ham emails most common structures:
[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3)]
------------------------------------------------------------
Spam emails most common structures:
[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19)]


We can see that the email structure has useful information in it that can help classification. The ham emails are mostly text plain
and some have pgp signatures while no spam emails have pgp signatures. 

In [24]:
# Exploring email headers

for header, value in spam_emails[0].items():
  print (header, ":", value)


Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

# Data Preprocessing

In [27]:
#Train / test split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Parse HTML in emails

def html_to_plain_text(html):
  # remove head section
  text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)

  # replace all <a>...</a> with HYPERLINK
  text = re.sub('<a\s.*?>', 'HYPERLINK ', text, flags=re.M | re.S | re.I)

  # remove all HTML tags
  text = re.sub('<.*?>', '', text, flags=re.M | re.S)

  # remove white spaces
  text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)

  # unescape html (e.g &gt or &nbsp)
  return unescape(text)



In [38]:
# Getting all html type emails (only spam category had any)
html_spam_emails = [email for email in X_train[y_train == 1] if get_email_structure(email) == "text/html"]

# Seeing one of them before and after the html_to_plain_text function
sample_html_spam = html_spam_emails[7]
print ("BEFORE html_to_plain_text: ")
print (sample_html_spam.get_content().strip()[:1000], "...")

print ("-" * 200)

print ("AFTER html_to_plain_text: ")
print (html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


BEFORE html_to_plain_text: 
<HTML><HEAD><TITLE></TITLE><META http-equiv="Content-Type" content="text/html; charset=windows-1252"><STYLE>A:link {TEX-DECORATION: none}A:active {TEXT-DECORATION: none}A:visited {TEXT-DECORATION: none}A:hover {COLOR: #0033ff; TEXT-DECORATION: underline}</STYLE><META content="MSHTML 6.00.2713.1100" name="GENERATOR"></HEAD>
<BODY text="#000000" vLink="#0033ff" link="#0033ff" bgColor="#CCCC99"><TABLE borderColor="#660000" cellSpacing="0" cellPadding="0" border="0" width="100%"><TR><TD bgColor="#CCCC99" valign="top" colspan="2" height="27">
<font size="6" face="Arial, Helvetica, sans-serif" color="#660000">
<b>OTC</b></font></TD></TR><TR><TD height="2" bgcolor="#6a694f">
<font size="5" face="Times New Roman, Times, serif" color="#FFFFFF">
<b>&nbsp;Newsletter</b></font></TD><TD height="2" bgcolor="#6a694f"><div align="right"><font color="#FFFFFF">
<b>Discover Tomorrow's Winners&nbsp;</b></font></div></TD></TR><TR><TD height="25" colspan="2" bgcolor="#CCCC99"><ta

In [39]:
# Converting an email object to plain text (emails are python objects)

def email_to_text(email):
  html = None
  for part in email.walk():
    ctype = part.get_content_type()
    
    # email types we're not handling
    if not ctype in ("text/plain", "text/html"):
      continue

    try:
      content = part.get_content()
    except:
      # in case of encoding issues
      content = str(part.get_payload())
    
    if ctype == "text/plain":
      return content
    else:
      html = content
  
  if html:
    return html_to_plain_text(html)

print (email_to_text(sample_html_spam)[:100], "...")



OTC
 Newsletter
Discover Tomorrow's Winners 
For Immediate Release
Cal-Bay (Stock Symbol: CBYI)
Wat ...


In [48]:
# Stemming the text 
stemmer = nltk.PorterStemmer()

In [46]:
# Extracting URLs from the text
url_extractor = urlextract.URLExtract()

# example usage
some_text = "Will it detect github.com and https://kdjalsdjk.com?"
url_extractor.find_urls(some_text)

['github.com', 'https://kdjalsdjk.com']

In [68]:
# Putting together all our transformations above into a transformer. It cleans the emails and parses them into text

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
               replace_urls=True, replace_numbers=True, stemming=True) -> None:
      self.strip_headers = strip_headers
      self.lower_case = lower_case
      self.remove_punctuation = remove_punctuation
      self.replace_urls = replace_urls
      self.replace_numbers = replace_numbers
      self.stemming = stemming

  def fit(self, X, y=None):
     return self
  
  def transform(self, X, y=None):
     X_transformed = []
     for email in X:
        # get the email's text
        text = email_to_text(email)
        
        if not text: 
           text = " "

        # lower case text
        if self.lower_case:
           text = text.lower()
        
        # replace URLs with the word URL
        if self.replace_urls and url_extractor is not None:
           urls = list(set(url_extractor.find_urls(text)))
           for url in urls:
              text = text.replace(url, " URL ")
        
        # replace numbers with the word NUMBER
        if self.replace_numbers:
           text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)

        word_counts = Counter(text.split())

        if self.stemming and stemmer is not None:
           stemmed_word_counts = Counter()
           for word, count in word_counts.items():
              stemmed_word = stemmer.stem(word)
              stemmed_word_counts[stemmed_word] += count
           word_counts = stemmed_word_counts
        
        X_transformed.append(word_counts)
     
     return np.array(X_transformed)
        

In [63]:
# trying out the transformer on some email examples

X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
print (X_few_wordcounts)

[Counter({'chuck': 1, 'murcko': 1, 'wrote:': 1, '>[...stuff...]': 1, 'yawn.': 1, 'r': 1})
 Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'to': 3, 'by': 3, 'have': 2, 'superstit': 2, 'one': 2, 'on': 2, 'been': 2, 'half': 2, 'teach': 2, 'some': 1, 'interest': 1, 'quotes...': 1, 'url': 1, 'thoma': 1, 'jefferson:': 1, '"i': 1, 'examin': 1, 'known': 1, 'word,': 1, 'i': 1, 'do': 1, 'not': 1, 'find': 1, 'in': 1, 'our': 1, 'particular': 1, 'christian': 1, 'redeem': 1, 'feature.': 1, 'they': 1, 'are': 1, 'alik': 1, 'found': 1, 'fabl': 1, 'mythology.': 1, 'million': 1, 'innoc': 1, 'men,': 1, 'women': 1, 'children,': 1, 'sinc': 1, 'introduct': 1, 'christianity,': 1, 'burnt,': 1, 'tortured,': 1, 'fine': 1, 'imprisoned.': 1, 'what': 1, 'ha': 1, 'effect': 1, 'thi': 1, 'coercion?': 1, 'make': 1, 'world': 1, 'fool': 1, 'other': 1, 'hypocrites;': 1, 'support': 1, 'rogueri': 1, 'error': 1, 'over': 1, 'earth."': 1, 'six': 1, 'histor': 1, 'americans,': 1, 'john': 1, 'e.': 1, 'remsburg,': 1, 'letter': 1

In [56]:
# Converting word counts into a matrix: 
# Each row represents an email's word count array
# Each column will indicate the count/absence of a word 

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, vocabulary_size=1000) -> None:
    self.vocabulary_size = vocabulary_size
  
  # Builds a vocabulary of the <vocabulary_size> most common words
  def fit(self, X, y=None):
    total_count = Counter()
    for word_count in X:
      for word, count in word_count.items():
        # capping counts at 10
        total_count[word] += min(count, 10)
    most_common = total_count.most_common()[:self.vocabulary_size]
    self.vocabulary_ = {word: index+1 for index, (word, count) in enumerate(most_common)}

    return self

  # Builds a sparse matrix where each row is an email example and each column 
  # indicates the count/absence of a word 
  def transform(self, X, y=None):
    rows = []
    cols = []
    data = []
    # for each email (row), get the word_count Counter
    for row, word_count in enumerate(X):
      # for each word in the Counter, get the word's index in the vocab and append the 
      # count of the word
      for word, count in word_count.items():
        rows.append(row)
        cols.append(self.vocabulary_.get(word, 0))
        data.append(count)
    
    return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size+1))

In [58]:
# Testing the WordCounterToVectorTransformer on a few examples

# The first column in the matrix represents the 0th index which means words not found in the most common word vocab.
# Each column after represents a word and its count

vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
print (X_few_vectors.toarray())
print (vocab_transformer.vocabulary_)


[[  6   0   0   0   0   0   0   0   0   0   0]
 [101  11   9   8   1   3   0   3   1   2   3]
 [ 64   0   1   2   4   2   5   1   2   1   0]]
{'the': 1, 'of': 2, 'and': 3, 'url': 4, 'to': 5, '>': 6, 'all': 7, 'in': 8, 'on': 9, 'by': 10}


In [72]:
# Building a pipeline from both transformers

preprocess_pipeline = Pipeline([
  ("email_to_wordcount", EmailToWordCounterTransformer()),
  ("wordcount_to_vector", WordCounterToVectorTransformer())
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3)
score.mean()

0.9866666666666667

In [73]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print (f"Precision: ", precision_score(y_test, y_pred))
print (f"Recall: ", recall_score(y_test, y_pred))


Precision:  0.9574468085106383
Recall:  0.9473684210526315


In [None]:
new_examples = [
  ["MUST REPLY NOW!!!"]
]