### Loading the libraries

In [9]:
import os, time
import numpy as np
import pandas as pd
import tarfile
import urllib

### Loading the data set to local from online resource

In [10]:
## Getting the data url
DOWNLOAD_ROOT = "https://spamassassin.apache.org/old/publiccorpus/"
RAW_PATH = "../datasets/raw/"
EXTRACTED_PATH = "../datasets/extracted/"

In [25]:
def fetch_data(data_name):
    '''
    function to download the data on `data_name` to `data_path`. 
    # Uses 2 arguments:
    - data_name : name of the data
    - data_path : location for the data to be dowloaded
    '''
    
    ##Defining the data url
    data_url = DOWNLOAD_ROOT + data_name
    os.makedirs(RAW_PATH + files[data_name], exist_ok=True)
    tgz_path = os.path.join(RAW_PATH + files[data_name], data_name)
    urllib.request.urlretrieve(data_url, tgz_path)
    data_tgz = tarfile.open(tgz_path, 'r')
    ext_path = EXTRACTED_PATH + files[data_name]
    ##Merging the different files into ham or spam and avoiding subfolders
    for member in data_tgz.getmembers():
        if member.isreg():
            member.name = os.path.basename(member.name)
            data_tgz.extract(member,ext_path)
    data_tgz.close()

In [12]:
files = {'20021010_spam.tar.bz2':'spam',
         '20030228_spam.tar.bz2':'spam',
         '20030228_spam_2.tar.bz2':'spam',
         '20050311_spam_2.tar.bz2':'spam',
         '20021010_easy_ham.tar.bz2':'ham',
         '20021010_hard_ham.tar.bz2':'ham',
         '20030228_easy_ham.tar.bz2':'ham',
         '20030228_easy_ham_2.tar.bz2':'ham',
         '20030228_hard_ham.tar.bz2':'ham'}

In [8]:
#Testing the dictionary
for file in files.keys():
    print(files[file])

spam
spam
spam
spam
ham
ham
ham
ham
ham


In [26]:
##Fetchign the data
for file in files:
    print("Extracting ",file)
    fetch_data(file)

Extracting  20021010_spam.tar.bz2
Extracting  20030228_spam.tar.bz2
Extracting  20030228_spam_2.tar.bz2
Extracting  20050311_spam_2.tar.bz2
Extracting  20021010_easy_ham.tar.bz2
Extracting  20021010_hard_ham.tar.bz2
Extracting  20030228_easy_ham.tar.bz2
Extracting  20030228_easy_ham_2.tar.bz2
Extracting  20030228_hard_ham.tar.bz2


**Data Download Complete**

Checking the count of files extracted

In [33]:
def parse_emails(directory, filename):
    with open(os.path.join(directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

ham_emails = []
spam_emails = []      

dir_name = ''.join([EXTRACTED_PATH,'spam'])
for name in sorted(os.listdir(dir_name)):
    spam_emails.append(parse_emails(dir_name,name))
    
dir_name = ''.join([EXTRACTED_PATH,'ham'])
for name in sorted(os.listdir(dir_name)):
    ham_emails.append(parse_emails(dir_name,name))

In [34]:
print(len(ham_emails))
print(len(spam_emails))

6952
2399


In [35]:
print(100*len(ham_emails)/(len(ham_emails) + len(spam_emails)))
print(100*len(spam_emails)/(len(ham_emails) + len(spam_emails)))

74.34498984065876
25.65501015934125


**75% of the data is HAM emails, and 25% of the data is SPAM Emails**

Checking the email formats for each SPAM and HAM emails

In [29]:
import email
import email.policy

In [35]:
#Testing parsing an email
##https://docs.python.org/3/library/email.message.html

directory = "easy_ham"
filename = "00004.864220c5b6930b209cc287c361c99af1"

with open(os.path.join(EXTRACTED_PATH, directory, filename), "rb") as f:
    test_file = f
    test_email =  email.parser.BytesParser(policy=email.policy.default).parse(f)

 

In [32]:
test_email.get_content()

"Klez: The Virus That Won't Die\n \nAlready the most prolific virus ever, Klez continues to wreak havoc.\n\nAndrew Brandt\n>>From the September 2002 issue of PC World magazine\nPosted Thursday, August 01, 2002\n\n\nThe Klez worm is approaching its seventh month of wriggling across \nthe Web, making it one of the most persistent viruses ever. And \nexperts warn that it may be a harbinger of new viruses that use a \ncombination of pernicious approaches to go from PC to PC.\n\nAntivirus software makers Symantec and McAfee both report more than \n2000 new infections daily, with no sign of letup at press time. The \nBritish security firm MessageLabs estimates that 1 in every 300 \ne-mail messages holds a variation of the Klez virus, and says that \nKlez has already surpassed last summer's SirCam as the most prolific \nvirus ever.\n\nAnd some newer Klez variants aren't merely nuisances--they can carry \nother viruses in them that corrupt your data.\n\n...\n\nhttp://www.pcworld.com/news/artic

In [33]:
test_email.items()

[('Return-Path', '<irregulars-admin@tb.tf>'),
 ('Delivered-To', 'zzzz@localhost.netnoteinc.com'),
 ('Received',
  'from localhost (localhost [127.0.0.1])\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id 9DAE147C66\tfor <zzzz@localhost>; Thu, 22 Aug 2002 09:23:38 -0400 (EDT)'),
 ('Received',
  'from phobos [127.0.0.1]\tby localhost with IMAP (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 14:23:38 +0100 (IST)'),
 ('Received',
  'from web.tb.tf (route-64-131-126-36.telocity.com    [64.131.126.36]) by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id    g7MDGOZ07922 for <zzzz-irr@spamassassin.taint.org>; Thu, 22 Aug 2002 14:16:24 +0100'),
 ('Received',
  'from web.tb.tf (localhost.localdomain [127.0.0.1]) by web.tb.tf    (8.11.6/8.11.6) with ESMTP id g7MDP9I16418; Thu, 22 Aug 2002 09:25:09    -0400'),
 ('Received',
  'from red.harvee.home (red [192.168.25.1] (may be forged)) by    web.tb.tf (8.11.6/8.11.6) with ESMTP id g7MDO4I16408 for    <irregulars@tb.tf>; T

In [34]:
test_email.get_payload()

"Klez: The Virus That Won't Die\n \nAlready the most prolific virus ever, Klez continues to wreak havoc.\n\nAndrew Brandt\n>>From the September 2002 issue of PC World magazine\nPosted Thursday, August 01, 2002\n\n\nThe Klez worm is approaching its seventh month of wriggling across \nthe Web, making it one of the most persistent viruses ever. And \nexperts warn that it may be a harbinger of new viruses that use a \ncombination of pernicious approaches to go from PC to PC.\n\nAntivirus software makers Symantec and McAfee both report more than \n2000 new infections daily, with no sign of letup at press time. The \nBritish security firm MessageLabs estimates that 1 in every 300 \ne-mail messages holds a variation of the Klez virus, and says that \nKlez has already surpassed last summer's SirCam as the most prolific \nvirus ever.\n\nAnd some newer Klez variants aren't merely nuisances--they can carry \nother viruses in them that corrupt your data.\n\n...\n\nhttp://www.pcworld.com/news/artic

In [37]:
len(test_email.keys())

28

In [9]:
ham_emails[0]["Subject"]

'Re: New Sequences Window'

In [10]:
spam_emails[5]["Subject"]

'[ILUG-Social] re: Guaranteed to lose 10-12 lbs in 30 days 10.148'

In [23]:
max = 0
for email in spam_emails:
    l = len(email.keys())
    print(l)
    if(l == 28):
        
        print(email.get_content().strip())
#     if(l >= max):
#         max = l
print(max)

0
14
28
1) Fight The Risk of Cancer!
http://www.adclick.ws/p.cfm?o=315&s=pk007

2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days
http://www.adclick.ws/p.cfm?o=249&s=pk007

3) Get the Child Support You Deserve - Free Legal Advice
http://www.adclick.ws/p.cfm?o=245&s=pk002

4) Join the Web's Fastest Growing Singles Community
http://www.adclick.ws/p.cfm?o=259&s=pk007

5) Start Your Private Photo Album Online!
http://www.adclick.ws/p.cfm?o=283&s=pk007

Have a Wonderful Day,
Offer Manager
PrizeMama













If you wish to leave this list please use the link below.
http://www.qves.com/trim/?ilug@linux.ie%7C17%7C114258


-- 
Irish Linux Users' Group: ilug@linux.ie
http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.
List maintainer: listmaster@linux.ie
19
21
28
I thought you might like these:
1) Slim Down - Guaranteed to lose 10-12 lbs in 30 days
http://www.freeyankee.com/cgi/fy2/to.cgi?l=822slim1

2) Fight The Risk of Cancer! 
http://www.freeyankee.com/cgi/fy2

KeyError: 'multipart/mixed'

In [24]:
max = 0
for email in ham_emails:
#     l = len(email.keys())
#     print(l)
    print(email.get_content().strip())
#     if(l >= max):
#         max = l
print(max)

Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 

KeyError: 'multipart/signed'

In [25]:
def email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()
    
from collections import Counter

def structure_count(emails):
    structures = Counter()
    for email in emails:
        structure = email_structure(email)
        structures[structure] += 1
    return structures    

In [26]:
structure_count(ham_emails).most_common()

[('text/plain', 6374),
 ('text/html', 240),
 ('multipart(text/plain, application/pgp-signature)', 173),
 ('multipart(text/plain, text/html)', 109),
 ('multipart(text/plain, text/plain)', 9),
 ('multipart(text/plain)', 6),
 ('multipart(text/plain, application/octet-stream)', 4),
 ('multipart(text/plain, application/x-pkcs7-signature)', 4),
 ('multipart(text/html)', 4),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 3),
 ('multipart(text/plain, multipart(text/plain))', 3),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  3),
 ('multipart(text/plain, text/enriched)', 2),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  2),
 ('multipart(text/plain, video/mng)', 2),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  2),
 ('multipart(text/plain, application/x-java-applet)', 2),
 ('multipart(text/plain, image/bmp)', 2),
 ('mul

In [27]:
structure_count(spam_emails).most_common()

[('text/plain', 1040),
 ('text/html', 953),
 ('multipart(text/plain, text/html)', 204),
 ('multipart(text/html)', 68),
 ('multipart(text/plain)', 63),
 ('multipart(multipart(text/html))', 28),
 ('multipart(text/plain, image/jpeg)', 6),
 ('multipart(multipart(text/plain, text/html))', 5),
 ('multipart(text/plain, application/octet-stream)', 4),
 ('multipart(text/html, text/plain)', 4),
 ('multipart(text/html, application/octet-stream)', 4),
 ('multipart/alternative', 3),
 ('multipart(text/plain, application/octet-stream, text/plain)', 3),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 2),
 ('multipart(multipart(text/plain, text/html), image/gif)', 2),
 ('multipart(text/html, image/jpeg)', 2),
 ('multipart(multipart(text/plain), application/octet-stream)', 2),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/jpeg)',
  1),
 ('multipart(multipart(text/plain,

In [None]:
https://colab.research.google.com/github/PseudoCodeNerd/blog/blob/master/_notebooks/2019-10-19-spamClassifier-Oreilly-homework-chapter3.ipynb#scrollTo=zHQGVxveo_0W