In [1]:
##set up
import numpy as np
import pandas as pd
import string
import email
import os ##use this to traverse folders
from email.parser import Parser
import re
import lxml.html
import collections

In [2]:
# Read the index file under trec06p/full/. Each non-blank line is expected to
# hold a class label followed by a relative message path (presumably of the
# form "<label> ../data/NNN/NNN" -- confirm against the dataset).
with open('trec06p/full/index', 'r') as ifile:
    raw_labels = ifile.readlines()

# The first whitespace-separated token is the class label. Blank lines are
# skipped because split()[0] would raise IndexError on them.
categories = [line.split()[0] for line in raw_labels if line.strip()]

# Everything after the last "." is the path suffix that gets appended to
# 'trec06p' later. rstrip (instead of slicing off the last character) keeps
# the final path intact when the file does not end with a newline.
email_nums = [line.split(".")[-1].rstrip("\r\n") for line in raw_labels if line.strip()]

# Label of each message, indexed by its path suffix.
labels = pd.Series(categories, index=email_nums)

In [3]:
# extract the body of email
def walkMsg(msg):
    """
    Iterate over the decoded payloads of a parsed email.

    Input: message object exposing walk()
    Output: generator yielding get_payload(decode=True) for every part
            whose content type is not "multipart/alternative"
    """
    skipped_type = "multipart/alternative"
    for section in msg.walk():
        if section.get_content_type() != skipped_type:
            yield section.get_payload(decode=True)

In [4]:
##encode
def robust_decode(bs):
    '''Decode a byte string to text, falling back through several codecs.

    The codecs are tried in this order:
      1. 'unicode-escape'
      2. 'utf-8-sig'
      3. 'ISO-8859-1', after which every non-ASCII character is dropped

    Param bs: byte string to decode.
    Returns: the decoded text. Every branch returns decoded text, so the
    results can be joined together safely.
    '''
    try:
        return bs.decode('unicode-escape')
    except UnicodeError:
        # Only decoding failures trigger the fallback; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        pass
    try:
        return bs.decode('utf-8-sig')
    except UnicodeDecodeError:
        pass
    # ISO-8859-1 assigns a character to every byte value, so this last
    # decode always succeeds; non-ASCII characters are then discarded.
    ascii_only = bs.decode('ISO-8859-1').encode("ascii", "ignore")
    # Decode again so this branch returns text like the two above.
    return ascii_only.decode("ascii")

In [5]:
def _search_date(pattern, date_header):
    """Return the first match of `pattern` in `date_header`, or None."""
    try:
        match = re.search(pattern, date_header)
    except TypeError:
        # The header is missing (None) or is not a plain string.
        return None
    return match.group(0) if match else None


def info_of_email_2():
    """
    Parse every listed message and collect one feature row per message.

    Input: none; reads the module-level `email_nums` (path suffixes that are
           appended to 'trec06p') and `categories` (class label per message)
    Output: a list of rows, each of the form
            [year, weekday, hour, month, From, To, cc, bcc,
             content type, subject, body text, classification]
            Date-derived fields are None when they cannot be extracted.
    """
    parser = Parser()
    info = []
    for email_num, category in zip(email_nums, categories):
        filepath = 'trec06p%s' % email_num

        # Only parsing needs the file handle, so close it right afterwards.
        # The local is named `msg` to avoid shadowing the `email` module.
        with open(filepath, 'r') as ifile:
            msg = parser.parse(ifile)

        Date = msg.get("Date")

        # Join the decoded payloads, then drop line breaks and tabs.
        Content = " ".join(robust_decode(part) for part in walkMsg(msg)
                           if part is not None)
        Content = ''.join(Content.splitlines())
        Content = Content.replace("\t", "")

        if 'html' in Content:
            try:
                Content = lxml.html.fromstring(Content).text_content()
            except Exception:
                # Best effort: keep the raw text when lxml cannot parse it.
                pass

        # NOTE(review): the year is the first run of four digits in the
        # header, so a header with a two-digit year can match other digits
        # (e.g. a numeric timezone offset) instead.
        year = _search_date(r'\d{4}', Date)
        clock = _search_date(r'\d{2}:\d{2}:\d{2}', Date)
        hour = clock.split(":")[0] if clock is not None else None
        weekday = _search_date('Mon|Tue|Wed|Thu|Fri|Sat|Sun', Date)
        month = _search_date(
            'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept', Date)

        info.append([year, weekday, hour, month,
                     msg.get("From"), msg.get('To'),
                     msg.get('cc'), msg.get('bcc'),
                     str(msg.get("Content-Type")).split(";")[0].lower(),
                     msg.get('subject'),
                     Content,
                     category])
    return info

In [6]:
# Build the feature table: one row per parsed message.
df = pd.DataFrame(
    data=info_of_email_2(),
    columns=[
        'Year', 'Weekday', "Hour", "Month",
        'From', 'To', 'cc', 'bcc',
        'Content_type', 'Subjects', 'Content', 'Classification',
    ],
)

In [7]:
# (rows, columns) of the full table built from info_of_email_2().
df.shape

(37822, 12)

In [8]:
# Keep only the rows whose extracted Year is one of '1999' through '2006'.
df06 = df[df["Year"].isin(list(map(str, range(1999, 2007))))]

In [9]:
# (rows, columns) remaining after the Year filter.
df06.shape

(25370, 12)

### Output

In [11]:
# Persist the year-filtered table. NOTE(review): to_pickle writes a pickle
# file even though the filename ends in ".txt".
df06.to_pickle("df2006_v2.txt")

In [10]:
# Number of messages per extracted year in the filtered table.
collections.Counter(df06["Year"])

Counter({'1999': 7440,
         '2000': 4456,
         '2001': 1108,
         '2002': 1729,
         '2003': 1583,
         '2004': 1827,
         '2005': 3404,
         '2006': 3823})

### Note: The remaining cells are for testing

# Scratch: display every row whose extracted Year is "1999".
df[df.Year == "1999"]

In [32]:
# Rows whose Year is not a 4-character value. info_of_email_2 stores None
# when no year is found, and len(None) raises TypeError; .str.len() yields
# NaN for those entries and NaN != 4 keeps them in the selection.
df.loc[df['Year'].str.len() != 4]

Unnamed: 0,Year,Month,Day,Weekday,Hour,From,To,cc,bcc,Content_type,Subjects,Content,Classification
29,[],[],[],[],[],=?iso-2022-jp?B?d3JxZA==?=<qdrq@yahoo.co.hk>,<winnie@groucho.cs.psu.edu>,,,text/plain,=?iso-2022-jp?B?GyRCOSUkKyRsJGs/TSFaJVslQyVIJW...,$B=w$N;R$+$iCK$N;R$X%*%9%9%a$9$k%J%$%9$JJ8=qH...,spam
55,[],[],[],[],[],=?iso-2022-jp?B?d3JxZA==?=<qdrq@yahoo.co.hk>,<winnie@groucho.cs.psu.edu>,,,text/plain,=?iso-2022-jp?B?GyRCOSUkKyRsJGs/TSFaJVslQyVIJW...,$B=w$N;R$+$iCK$N;R$X%*%9%9%a$9$k%J%$%9$JJ8=qH...,spam
67,[],[],[],[],[],"""houndog42"" <tgfyneucbjy@hotmail.com>",rosalie@groucho.cse.psu.edu,,,text/html,After Penis Enlarge Patch you cock won�t resem...,99 percent of <ach></ach>all men have a weaker...,spam
91,[],[],[],[],[],"""houndog42"" <tgfyneucbjy@hotmail.com>",rosalie@groucho.cse.psu.edu,,,text/html,After Penis Enlarge Patch you cock won�t resem...,99 percent of <ach></ach>all men have a weaker...,spam
105,[],[],[],[],[],=?iso-2022-jp?B?Y2NieQ==?=<ccby@hotmail.com>,<flossie@groucho.cse.psu.edu>,,,text/plain,=?iso-2022-jp?B?GyRCRTdBMyRBJGMkcyUyJUMlSBsoQg...,$B#G#W$O$3$&$$$&=w$N;R$H2a$4$7$?$$$C$9!#(Bht...,spam
112,Jul,21,"Wed,",Two,1993,"""Marcus Samuel"" <comments@100-free-sex-picture...",jillian@groucho.cse.psu.edu,,,text/plain,fwd: The hottest pick daily news,"Get CTXE First Thing Today, This Is Going To E...",spam
116,[],[],[],[],[],=?iso-2022-jp?B?eXVraWhhbmE=?=<yukihana@yahoo....,<sherri@groucho.cse.psu.edu>,,,text/plain,=?iso-2022-jp?B?UmU6?=,$B40(-A4(-L5(-NA(-!u(-;H(-$$(-J|(-Bj(-(B$B(...,spam
123,[],[],[],[],[],=?iso-2022-jp?B?Y2NieQ==?=<ccby@hotmail.com>,<flossie@groucho.cse.psu.edu>,,,text/plain,=?iso-2022-jp?B?GyRCRTdBMyRBJGMkcyUyJUMlSBsoQg...,$B#G#W$O$3$&$$$&=w$N;R$H2a$4$7$?$$$C$9!#(Bht...,spam
132,93,Jul,29,Thu,07,stassen@alc-ohio.alc.com (Chris Stassen),christian@geneva.rutgers.edu,,,,Re: reading list wanted,Enclosed is Mr. Buxton's proposed creationist ...,ham
145,16:40:34,1993,Aug,3,GMT,huggins@quip.eecs.umich.edu (Jim Huggins),soc-religion-christian@uunet.uu.net,,,,Re: NEEDED: Bible Study Resource(s) on Creatio...,In article <Jul.28.02.36.45.1993.6248@geneva.r...,ham


In [10]:
# Scratch check: parse a single message (position 9293 in email_nums) and
# print it in full.
# NOTE(review): assigning to `email` here rebinds the name of the `email`
# module imported in the first cell; later cells read this variable.
filepath = 'trec06p%s' % email_nums[9293]
parser = Parser()
with open(filepath, 'r') as ifile:
    email = parser.parse(ifile)
print(email)

From nobody Wed May 31 11:02:36 2017
Received: from media.mit.edu (unknown [80.71.216.37]) by aleve.media.mit.edu
 (8.9.1a/8.9.1/+ALEVE)
 with ESMTP id MAA14280 for <handyboard@media.mit.edu>;
 Fri, 1 Jan 1999 06:06:31 -0500 (EST)
Received: from dogtrot.wilson.ac.jp (exhibit.attune.ac.jp)
 by babysit.tunic.ac.jp (8.11.4/8.11.0av) with ESMTP id g6719Nl6362;
 Fri, 1 Jan 1999 00:03:19 -0700
Message-ID: <mbhxhfzmq@eos.ocn.ne.jp.pt>
Date: Fri, 1 Jan 1999 11:03:19 +0400
From: "Timothy Stephenson" <mbhxhfzmq@eos.ocn.ne.jp>
To: handyboard@media.mit.edu, earline@media.mit.edu, karyn@media.mit.edu,
 sofia@media.mit.edu, eddie@media.mit.edu, janelle@media.mit.edu,
 cherie@media.mit.edu, juan@media.mit.edu, eddie@media.mit.edu
Subject: Boosts Energy
MIME-Version: 1.0
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 8bit

Summer is coming, did you look in the mirror lately?

If you`re still overweight you MUST visit us:

http://051.doghealthok.com

WE WILL HELP

In [11]:
# Show the raw Date header of the message held in `email`.
Date = email.get('Date')
print(Date)

Fri, 1 Jan 1999 11:03:19 +0400


In [12]:
# Call form of print: valid on both Python 2 and Python 3, and consistent
# with the other print calls in this notebook.
print(email)

From nobody Wed May 31 11:02:44 2017
Received: from media.mit.edu (unknown [80.71.216.37]) by aleve.media.mit.edu
 (8.9.1a/8.9.1/+ALEVE)
 with ESMTP id MAA14280 for <handyboard@media.mit.edu>;
 Fri, 1 Jan 1999 06:06:31 -0500 (EST)
Received: from dogtrot.wilson.ac.jp (exhibit.attune.ac.jp)
 by babysit.tunic.ac.jp (8.11.4/8.11.0av) with ESMTP id g6719Nl6362;
 Fri, 1 Jan 1999 00:03:19 -0700
Message-ID: <mbhxhfzmq@eos.ocn.ne.jp.pt>
Date: Fri, 1 Jan 1999 11:03:19 +0400
From: "Timothy Stephenson" <mbhxhfzmq@eos.ocn.ne.jp>
To: handyboard@media.mit.edu, earline@media.mit.edu, karyn@media.mit.edu,
 sofia@media.mit.edu, eddie@media.mit.edu, janelle@media.mit.edu,
 cherie@media.mit.edu, juan@media.mit.edu, eddie@media.mit.edu
Subject: Boosts Energy
MIME-Version: 1.0
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 8bit

Summer is coming, did you look in the mirror lately?

If you`re still overweight you MUST visit us:

http://051.doghealthok.com

WE WILL HELP

In [13]:
### Output
# Persist the full, unfiltered table. NOTE(review): to_pickle writes a pickle
# file even though the filename ends in ".txt".
df.to_pickle("df2006.txt")