In [1]:
import os
import pandas as pd
import codecs
import string
import nltk
from nltk.tokenize import RegexpTokenizer
import itertools
import unicodedata
from collections import OrderedDict
import re

In [2]:
loc = '/home/ankan/Projects/RNN-LSTM/enron1'
filelist = os.listdir(loc)

In [3]:
transformed_labels = OrderedDict([
        (" urlname ",
         '(http[s]?://|www)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
        (" date ",
         '([1-2][0-9][0-9][0-9])(.|-)(1[0-2]|0[1-9]|[1-9])(.|-)(2[0-9]|3[0-1]|1[0-9]|0[1-9]|[1-9])'),
        (" clocktime ", '(2[0-3]|1[0-9]|0[0-9]|[0-9]):([0-5][0-9])'),
        (" alphanumeric ", '([A-Za-z]+[0-9]|[0-9]+[A-Za-z])[A-Za-z0-9]*'),
        (" digit ", '\d+')
    ])

In [4]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

In [5]:
def remove_regex(input_text, regex_pattern, str_replace):
    line = re.sub(regex_pattern, str_replace, input_text)
    return line

In [6]:
def extract_labels(sentence):
    global labels
    for val, pat in transformed_labels.items():
        sentence = remove_regex(sentence, pat, val)
    return sentence

In [7]:
def unicodeToAscii(sentence):
    return ''.join(
        c for c in unicodedata.normalize('NFD', sentence)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [8]:
data = []
label = []
for d in filelist:
    dir_path = os.path.join(loc,d)
    if not os.path.isfile(dir_path):
        files = [f for f in os.listdir(dir_path)]
        for f in files:
            file_path = os.path.join(loc,d,f)
            with codecs.open(file_path, "r",encoding='utf-8', errors='ignore') as fdata:
                email_msg = unicodeToAscii(extract_labels(fdata.read()))
                #data.append(fdata.read())
                #print(email_msg)
                data.append(email_msg)
                label.append(d)
    else:
        continue
df = pd.DataFrame({'email': data,'label': label})


In [9]:
df.head(10)

Unnamed: 0,email,label
0,Subject best prices for impotence drugsone tim...,spam
1,Subject can we go over guillermo ' s budget to...,spam
2,Subject let the euro make you money,spam
3,Subject your investor communiqup digit get ab...,spam
4,Subject hp pavilion v digit digit crt mon...,spam
5,"Subject paliourg , best medswakey wakey to sa...",spam
6,Subject hi paliourg get all pills . everything...,spam
7,Subject re digit,spam
8,"Subject vicodin , via gra are che . ap here a...",spam
9,Subject full stock of all your p harmacy need...,spam


In [10]:
df.tail(10)

Unnamed: 0,email,label
5162,Subject enron nominations for november digit ...,ham
5163,"Subject megan jonessteve daren ,i interviewed...",ham
5164,Subject meter variances ua digit clean upd...,ham
5165,"Subject hpl nom for may digit , digit see ...",ham
5166,Subject cornhuskeri have entered deals into si...,ham
5167,"Subject hpl nom for april digit , digit se...",ham
5168,Subject digit th noms f...,ham
5169,Subject nomination eastrans digit digit...,ham
5170,Subject eastrans lst of month nomination eff...,ham
5171,Subject mobil beaumont fyifyi i ' ve entere...,ham


In [11]:
tokenizer = RegexpTokenizer(r'\w+')
processed_emails = []
for index, row in df.iterrows():
    email = row['email'].lower()
    email = tokenizer.tokenize(email)
    processed_emails.append(email)
df_processed_emails = pd.DataFrame({'processed_emails': processed_emails})
list_of_all_words = list(itertools.chain.from_iterable(processed_emails))
vocabulary = sorted(list(set(list_of_all_words)))
df_vocabulary = pd.DataFrame({'vocabulary': vocabulary})

In [12]:
df_processed_emails.head(10)

Unnamed: 0,processed_emails
0,"[subject, best, prices, for, impotence, drugso..."
1,"[subject, can, we, go, over, guillermo, s, bud..."
2,"[subject, let, the, euro, make, you, money]"
3,"[subject, your, investor, communiqup, digit, g..."
4,"[subject, hp, pavilion, v, digit, digit, crt, ..."
5,"[subject, paliourg, best, medswakey, wakey, to..."
6,"[subject, hi, paliourg, get, all, pills, every..."
7,"[subject, re, digit]"
8,"[subject, vicodin, via, gra, are, che, ap, her..."
9,"[subject, full, stock, of, all, your, p, harma..."


In [13]:
df = pd.concat([df, df_processed_emails], axis=1)

In [14]:
df.head(5)

Unnamed: 0,email,label,processed_emails
0,Subject best prices for impotence drugsone tim...,spam,"[subject, best, prices, for, impotence, drugso..."
1,Subject can we go over guillermo ' s budget to...,spam,"[subject, can, we, go, over, guillermo, s, bud..."
2,Subject let the euro make you money,spam,"[subject, let, the, euro, make, you, money]"
3,Subject your investor communiqup digit get ab...,spam,"[subject, your, investor, communiqup, digit, g..."
4,Subject hp pavilion v digit digit crt mon...,spam,"[subject, hp, pavilion, v, digit, digit, crt, ..."


In [15]:
df_vocabulary.head(10)

Unnamed: 0,vocabulary
0,a
1,aa
2,aaa
3,aaas
4,aabda
5,aac
6,aachecar
7,aacheck
8,aadelivered
9,aaer


In [16]:
df_vocabulary.tail(10)

Unnamed: 0,vocabulary
65153,zynve
65154,zyqtaqlt
65155,zyrtecantibiotic
65156,zyyqywp
65157,zzezrjok
65158,zzn
65159,zzo
65160,zzocb
65161,zzso
65162,zzsyt
