## Method to extract required infomation from email files

In [15]:
import pandas as pd
import numpy as np
import email
import os
from email.parser import Parser
import re
import lxml.html
import collections
import pickle

In [2]:
# extract the body of email
def walkMsg(msg):
    """
    Input: string of email
    Output: body of email
    """
    for part in msg.walk():
        if part.get_content_type() == "multipart/alternative":
            continue
        yield part.get_payload(decode=1)

In [3]:
def robust_decode(bs):
    '''Takes a byte string as param and convert it into a unicode one.
First tries UTF8, and fallback to Latin1 if it fails'''
    cr = None
    try:
        #cr = bs.decode('utf8')
        cr = bs.decode('utf-8-sig')
    except UnicodeDecodeError:
        cr = bs.decode('ISO-8859-1')
        cr = cr.encode("ascii", "ignore")
        #cr = cr.decode('utf-8-sig')
    return cr
#.replace("\n"," ").replace("\t"," ").replace("r"," ")

In [4]:
# extract the information of email
# still need some works to get number of cc and bcc
def info_of_email(path,List_of_files,classification):
    """
    Input: path, list of files name, classification(spam or ham)
    Ourput: a list with info of email
    """
    parser = Parser()
    info = []
    for files in List_of_files:
        emailText =  open(path + files, 'rb').read()
        email = parser.parsestr(emailText)
        if email.get('cc') == None:
            no_cc = 0
        else:
            no_cc = len(email.get('cc'))
        if email.get("bcc") == None:
            no_bcc = 0
        else:
            no_bcc = len(email.get('bcc'))
        Date = email.get("Date")

        Content = " ".join([robust_decode(i) for i in walkMsg(email) if i != None])
        Content = ''.join(Content.splitlines())
        Content = Content.replace("\t","")
        if 'html' in Content:
            Content = lxml.html.fromstring(Content).text_content()
        
        try:
            year = re.search('\d{4}',Date).group(0)
            hour = re.search('\d{2}:\d{2}:\d{2}',Date).group(0).split(":")[0]
            weekday = re.search('Mon|Tue|Wed|Thu|Fri|Sat|Sun',Date).group(0)
            info.append([year, weekday, hour, email.get("From"),email.get('To'), no_cc, no_bcc, 
                         str(email.get("Content-Type")).split(";")[0].lower(),
                         email.get('subject'), Content, classification])
        except:
            info.append([None, None, None, email.get("From"), 
                         email.get('To'), no_cc, no_bcc, 
                         str(email.get("Content-Type")).split(";")[0].lower(),
                         email.get('subject'), Content, classification])
    return info

### Spam

In [5]:
listing = os.listdir('./Spam/')
listing = [i for i in listing if i[-5:] != "Store"]
email_df = pd.DataFrame(info_of_email('./Spam/', listing, 'spam'),
                       columns=['Year','Weekday',"Hour",
                                'From','To','cc','bcc','Content_type','Subjects','Content','Classification'])

In [6]:
email_df.tail()

Unnamed: 0,Year,Weekday,Hour,From,To,cc,bcc,Content_type,Subjects,Content,Classification
31096,2002,Tue,18,"""IQ - TBA"" <tba@insiq.us>",<yyyy@spamassassin.taint.org>,0,0,multipart/alternative,Preferred Non-Smoker Rates for Smokers,Preferred Non-Smoker Just what the doctor ord...,spam
31097,2003,Sun,16,Mike <raye@yahoo.lv>,Mailing.List@user2.pro-ns.net,0,0,text/plain,"How to get 10,000 FREE hits per day to any web...","Dear Subscriber,If I could show you a way to g...",spam
31098,2020,Wed,4,"""Mr. Clean"" <cweqx@dialix.oz.au>",<Undisclosed.Recipients@webnote.net>,0,0,text/plain,Cannabis Difference,****Mid-Summer Customer Appreciation SALE!****...,spam
31099,2002,Wed,6,"""wilsonkamela400@netscape.net"" <wilsonkamela50...",ilug@linux.ie,0,0,text/plain,[ILUG] WILSON KAMELA,ATTN:SIR/MADAN STRI...,spam
31100,2005,Tue,9,"""Chia Patterson"" <pkrebehenne@businessesinport...",projecthoneypot@projecthoneypot.org,0,0,text/html,Just to her...,SOFT Viagra at $1.62 per doseReady to boost yo...,spam


In [7]:
email_df.shape

(31101, 11)

### Ham

In [8]:
path = './Ham/'
direc = os.listdir(path)
direc = [i for i in direc if i[-5:] != "Store"]

In [9]:
file_name = []
for i in direc:
    file_name.extend([path + i + "/" + f for f in os.listdir(path+i)])

In [10]:
ham_df = pd.DataFrame(info_of_email("",file_name,"ham"),
                      columns=['Year','Weekday',"Hour",
                                'From','To','cc','bcc','Content_type','Subjects','Content','Classification'])

In [11]:
ham_df.head()

Unnamed: 0,Year,Weekday,Hour,From,To,cc,bcc,Content_type,Subjects,Content,Classification
0,2001,Sun,12,no.address@enron.com,jeff.donahue@enron.com,24,24,text/plain,Asset Marketing/Corporate Development,"Jeff,As we discussed, attached is a comprehens...",ham
1,2001,Thu,11,no.address@enron.com,"m..presto@enron.com, mitch.robinson@enron.com,...",72,72,text/plain,RE: Conn. Coal Plants,Our research indicates that in 1998 Wisvest pa...,ham
2,2001,Mon,23,don.miller@enron.com,louise.kitchen@enron.com,0,0,text/plain,Peaker Update,"Louise,At your earliest convenience, I need to...",ham
3,2001,Fri,18,don.miller@enron.com,louise.kitchen@enron.com,0,0,text/plain,Revised 1999 and 2000 Peaker Action Lists as o...,"Louise,Attached are the Cinergy and Allegheny ...",ham
4,2001,Wed,12,no.address@enron.com,louise.kitchen@enron.com,0,0,text/plain,2001 Gain Calculation,"Louise,Per your request. Let me know if you h...",ham


In [12]:
ham_df.shape

(18228, 11)

### All Enron Email

In [13]:
enron_email = email_df.append(ham_df)

In [22]:
from collections import Counter
Counter(enron_email['Year'])

Counter({None: 583,
         '0102': 74,
         '1969': 2,
         '1980': 5,
         '1987': 1,
         '1996': 2,
         '1997': 3,
         '1998': 2,
         '1999': 138,
         '2000': 6861,
         '2001': 9512,
         '2002': 3539,
         '2003': 463,
         '2004': 11998,
         '2005': 16053,
         '2006': 17,
         '2007': 1,
         '2008': 1,
         '2009': 1,
         '2010': 1,
         '2020': 3,
         '2038': 68,
         '2548': 1})

### Output the data as pickle file

I choose to output the data in pickle because if output .csv file, we will face difficult when reading it with pd.read_csv

In [17]:
enron_email.to_pickle("enron_email.txt")

In [18]:
enron_pickle = pickle.load(open("enron_email.txt","rb"))

In [19]:
enron_pickle.head()

Unnamed: 0,Year,Weekday,Hour,From,To,cc,bcc,Content_type,Subjects,Content,Classification
0,2004,Sat,12,"""Bertha "" <Denny@mailbox.sk>",rait@bruce-guenter.dyndns.org,0,0,multipart/alternative,Get it up and keep it up praecox,shakedown NEWSLETTER stop NEWS bushel ISSUE ta...,spam
1,2004,Sat,22,"""LZEDEMCNODQBEZQBOV@encryption.com"" <Opal.Cham...",cvs@bruce-guenter.dyndns.org,29,0,multipart/alternative,Winner: $332573,Your [m]ortgage application was approved.You a...,spam
2,2004,Sun,1,Connie Wong <tatypin@huhmail.com>,Rait <rait@bruce-guenter.dyndns.org>,0,0,multipart/alternative,Need software? Click here.,TOP quality software:<br><br><b>Special Offer ...,spam
3,2004,Fri,17,shark@promotions.com <shark@promotions.com>,"""bruceg@em.ca"" <bruceg@em.ca>",0,0,multipart/alternative,Loyalty Bonuses for Shark Casino Players!,Featuring this weekend20/20Buy one and the Sha...,spam
4,2004,Sat,6,"""Important! Search Engine Alert!"" <rockwelldat...","""Bruceg"" <bruceg@em.ca>",0,0,multipart/alternative,Your web site has dropped out of the Search En...,Rockwell Data Corp This Email has been sent to...,spam
