In [71]:
import os
import pandas as pd
import numpy as np
import codecs
import string
import nltk
from nltk.tokenize import RegexpTokenizer
import itertools
import unicodedata
from collections import OrderedDict
import re
from collections import Counter

In [38]:
loc = '/home/ankan/Projects/RNN-LSTM/enron1'
filelist = os.listdir(loc)

In [40]:
transformed_labels = OrderedDict([
        (" urlname ",
         '(http[s]?://|www)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
        (" date ",
         '([1-2][0-9][0-9][0-9])(.|-)(1[0-2]|0[1-9]|[1-9])(.|-)(2[0-9]|3[0-1]|1[0-9]|0[1-9]|[1-9])'),
        (" clocktime ", '(2[0-3]|1[0-9]|0[0-9]|[0-9]):([0-5][0-9])'),
        (" alphanumeric ", '([A-Za-z]+[0-9]|[0-9]+[A-Za-z])[A-Za-z0-9]*'),
        (" digit ", '\d+')
    ])

In [4]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

In [42]:
def remove_regex(input_text, regex_pattern, str_replace):
    line = re.sub(regex_pattern, str_replace, input_text)
    return line

In [43]:
def extract_labels(sentence):
    global labels
    for val, pat in transformed_labels.items():
        sentence = remove_regex(sentence, pat, val)
    return sentence

In [44]:
def unicodeToAscii(sentence):
    return ''.join(
        c for c in unicodedata.normalize('NFD', sentence)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [45]:
data = []
label = []
for d in filelist:
    dir_path = os.path.join(loc,d)
    if not os.path.isfile(dir_path):
        files = [f for f in os.listdir(dir_path)]
        for f in files:
            file_path = os.path.join(loc,d,f)
            with codecs.open(file_path, "r",encoding='utf-8', errors='ignore') as fdata:
                email_msg = unicodeToAscii(extract_labels(fdata.read()))
                #email_msg = unicodeToAscii(fdata.read())
                #data.append(fdata.read())
                #print(email_msg)
                data.append(email_msg)
                label.append(d)
    else:
        continue
df = pd.DataFrame({'email': data,'label': label})
df.to_csv("/home/ankan/Projects/RNN-LSTM/email_data.csv", encoding='utf-8', index=False)

In [21]:
df.head(10)

Unnamed: 0,email,label
0,Subject best prices for impotence drugsone tim...,spam
1,Subject can we go over guillermo ' s budget to...,spam
2,Subject let the euro make you money,spam
3,Subject your investor communiqup get abzt firs...,spam
4,Subject hp pavilion v crt monitor w satell...,spam
5,"Subject paliourg , best medswakey wakey to sa...",spam
6,Subject hi paliourg get all pills . everything...,spam
7,Subject re,spam
8,"Subject vicodin , via gra are che . ap here a...",spam
9,Subject full stock of all your p harmacy need...,spam


In [22]:
df.tail(10)

Unnamed: 0,email,label
5162,"Subject enron nominations for november , see...",ham
5163,"Subject megan jonessteve daren ,i interviewed...",ham
5164,Subject meter variances ua clean updaren v...,ham
5165,"Subject hpl nom for may , see attached file ...",ham
5166,Subject cornhuskeri have entered deals into si...,ham
5167,"Subject hpl nom for april , see attached fil...",ham
5168,Subject th noms forwarde...,ham
5169,Subject nomination eastrans and the no...,ham
5170,Subject eastrans lst of month nomination eff...,ham
5171,Subject mobil beaumont fyifyi i ' ve entere...,ham


In [60]:
tokenizer = RegexpTokenizer(r'\w+')
processed_emails = []
email_words = []
list_of_all_words = []
for index, row in df.iterrows():
    email = row['email'].lower()
    email = tokenizer.tokenize(email)
    whole_email = ' '.join(word for word in email)
    processed_emails.append(whole_email)
    email_words.append(email)
df_processed_emails = pd.DataFrame({'processed_emails': processed_emails})
list_of_all_words = list(itertools.chain.from_iterable(email_words))
vocabulary = sorted(list(set(list_of_all_words)))
df_vocabulary = pd.DataFrame({'vocabulary': vocabulary})
df_cleaned = pd.DataFrame({'email': processed_emails,'label': label})
df_cleaned.to_csv("/home/ankan/Projects/RNN-LSTM/email_data_cleaned.csv", encoding='utf-8', index=False)

In [61]:
counts = Counter(list_of_all_words)
vocab = sorted(counts, key=counts.get, reverse = True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab)}

In [62]:
vocab_to_int

{'digit': 0,
 'the': 1,
 'to': 2,
 'ect': 3,
 'and': 4,
 'for': 5,
 'of': 6,
 'a': 7,
 'you': 8,
 'hou': 9,
 'in': 10,
 'on': 11,
 'is': 12,
 'subject': 13,
 'this': 14,
 'i': 15,
 'enron': 16,
 'be': 17,
 'that': 18,
 'we': 19,
 'from': 20,
 'will': 21,
 'have': 22,
 'your': 23,
 'with': 24,
 'at': 25,
 's': 26,
 'it': 27,
 'are': 28,
 'com': 29,
 'as': 30,
 'if': 31,
 'by': 32,
 'gas': 33,
 'or': 34,
 'please': 35,
 'not': 36,
 'deal': 37,
 'me': 38,
 'meter': 39,
 'hpl': 40,
 're': 41,
 'd': 42,
 'our': 43,
 'e': 44,
 'can': 45,
 'any': 46,
 'all': 47,
 'corp': 48,
 'thanks': 49,
 'has': 50,
 'was': 51,
 'know': 52,
 'daren': 53,
 'ectcc': 54,
 'need': 55,
 'am': 56,
 'an': 57,
 'pmto': 58,
 'new': 59,
 't': 60,
 'may': 61,
 'mmbtu': 62,
 'forwarded': 63,
 'up': 64,
 'j': 65,
 'do': 66,
 'should': 67,
 'no': 68,
 'get': 69,
 'there': 70,
 'see': 71,
 'http': 72,
 'amto': 73,
 'out': 74,
 'let': 75,
 'farmer': 76,
 'price': 77,
 'but': 78,
 'company': 79,
 'these': 80,
 'xls': 81,
 '

In [68]:
email_integers = []
for each in email_words:
    email_integers.append([vocab_to_int[word] for word in each])

In [69]:
from collections import Counter
email_lengths = Counter([len(x) for x in email_ints])
print("len_zero: {}".format(email_lengths[0]))
print("len_max: {}".format(max(email_lengths)))

len_zero: 0
len_max: 3559


In [83]:
length = 3559
words = np.zeros((len(email_integers), length), dtype=int)
for i, row in enumerate(email_integers):
    words[i, -len(row):] = np.array(row)[:length]

In [84]:
labels_int = np.array([1 if each == 'spam' else 0 for each in label])
labels_int

array([1, 1, 1, ..., 0, 0, 0])

In [90]:
split_frac = 0.90
split_index = int(len(words)*0.90)
train_x, test_x = words[:split_index], words[split_index:]
train_y, test_y = labels_int[:split_index], labels_int[split_index:]

print("Train set: {}".format(train_x.shape), 
      "\nTest set: {}".format(test_x.shape))

Train set: (4654, 3559) 
Test set: (518, 3559)


In [13]:
df = pd.concat([df, df_processed_emails], axis=1)

In [14]:
df.head(5)

Unnamed: 0,email,label,processed_emails
0,Subject best prices for impotence drugsone tim...,spam,"[subject, best, prices, for, impotence, drugso..."
1,Subject can we go over guillermo ' s budget to...,spam,"[subject, can, we, go, over, guillermo, s, bud..."
2,Subject let the euro make you money,spam,"[subject, let, the, euro, make, you, money]"
3,Subject your investor communiqup digit get ab...,spam,"[subject, your, investor, communiqup, digit, g..."
4,Subject hp pavilion v digit digit crt mon...,spam,"[subject, hp, pavilion, v, digit, digit, crt, ..."


In [15]:
df_vocabulary.head(10)

Unnamed: 0,vocabulary
0,a
1,aa
2,aaa
3,aaas
4,aabda
5,aac
6,aachecar
7,aacheck
8,aadelivered
9,aaer


In [16]:
df_vocabulary.tail(10)

Unnamed: 0,vocabulary
65153,zynve
65154,zyqtaqlt
65155,zyrtecantibiotic
65156,zyyqywp
65157,zzezrjok
65158,zzn
65159,zzo
65160,zzocb
65161,zzso
65162,zzsyt
