In [1]:
import os
import pandas as pd
import numpy as np
import codecs
import string
import nltk
from nltk.tokenize import RegexpTokenizer
import itertools
import unicodedata
from collections import OrderedDict
import re
from collections import Counter
import tensorflow as tf

In [2]:
# Loading Dataset

loc = '/home/ankan/Projects/RNN-LSTM/enron1'
filelist = os.listdir(loc)

In [3]:
# Regex for replacing website, clocktime, date, alphanumeric, digits

transformed_labels = OrderedDict([
        (" urlname ",
         '(http[s]?://|www)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
        (" date ",
         '([1-2][0-9][0-9][0-9])(.|-)(1[0-2]|0[1-9]|[1-9])(.|-)(2[0-9]|3[0-1]|1[0-9]|0[1-9]|[1-9])'),
        (" clocktime ", '(2[0-3]|1[0-9]|0[0-9]|[0-9]):([0-5][0-9])'),
        (" alphanumeric ", '([A-Za-z]+[0-9]|[0-9]+[A-Za-z])[A-Za-z0-9]*'),
        (" digit ", '\d+')
    ])

In [5]:
# All ascii letters

all_letters = string.ascii_letters + " .,;'"

In [4]:
# Replacing with Regex

def remove_regex(input_text, regex_pattern, str_replace):
    line = re.sub(regex_pattern, str_replace, input_text)
    return line

In [6]:
def extract_labels(sentence):
    global labels
    for val, pat in transformed_labels.items():
        sentence = remove_regex(sentence, pat, val)
    return sentence

In [7]:
# Converting Unicode to Ascii

def unicodeToAscii(sentence):
    return ''.join(
        c for c in unicodedata.normalize('NFD', sentence)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [8]:
# Making email_list and label_list

data = []
label = []
for d in filelist:
    dir_path = os.path.join(loc,d)
    if not os.path.isfile(dir_path):
        files = [f for f in os.listdir(dir_path)]
        for f in files:
            file_path = os.path.join(loc,d,f)
            with codecs.open(file_path, "r",encoding='utf-8', errors='ignore') as fdata:
                email_msg = unicodeToAscii(extract_labels(fdata.read()))
                data.append(email_msg)
                label.append(d)
    else:
        continue
df = pd.DataFrame({'email': data,'label': label})

In [9]:
df.head(10)

Unnamed: 0,email,label
0,Subject best prices for impotence drugsone tim...,spam
1,Subject can we go over guillermo ' s budget to...,spam
2,Subject let the euro make you money,spam
3,Subject your investor communiqup digit get ab...,spam
4,Subject hp pavilion v digit digit crt mon...,spam
5,"Subject paliourg , best medswakey wakey to sa...",spam
6,Subject hi paliourg get all pills . everything...,spam
7,Subject re digit,spam
8,"Subject vicodin , via gra are che . ap here a...",spam
9,Subject full stock of all your p harmacy need...,spam


In [10]:
df.tail(10)

Unnamed: 0,email,label
5162,Subject enron nominations for november digit ...,ham
5163,"Subject megan jonessteve daren ,i interviewed...",ham
5164,Subject meter variances ua digit clean upd...,ham
5165,"Subject hpl nom for may digit , digit see ...",ham
5166,Subject cornhuskeri have entered deals into si...,ham
5167,"Subject hpl nom for april digit , digit se...",ham
5168,Subject digit th noms f...,ham
5169,Subject nomination eastrans digit digit...,ham
5170,Subject eastrans lst of month nomination eff...,ham
5171,Subject mobil beaumont fyifyi i ' ve entere...,ham


In [11]:
# Removing punctuation from sentences

tokenizer = RegexpTokenizer(r'\w+')
processed_emails = []
email_words = []
list_of_all_words = []
for index, row in df.iterrows():
    email = row['email'].lower()
    email = tokenizer.tokenize(email)
    email_words.append(email)

# List of all Words

list_of_all_words = list(itertools.chain.from_iterable(email_words))

In [12]:
# Dictionary of vocabulary and their count

counts = Counter(list_of_all_words)
vocab = sorted(counts, key=counts.get, reverse = True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab)}

In [13]:
# Number of words in the Vocabulary

len(vocab_to_int)

65163

In [14]:
# Sentences in the form of their vocabulary counts

email_integers = []
for each in email_words:
    email_integers.append([vocab_to_int[word] for word in each])

In [15]:
email_lengths = Counter([len(x) for x in email_integers])

# Zero Length Sentences

print("len_zero: {}".format(email_lengths[0]))

# Maximum Length of the Sentence

print("len_max: {}".format(max(email_lengths)))

len_zero: 0
len_max: 3559


In [16]:
# Making each sentences of the length 3559 by left padding with zeros

length = 3559
words = np.zeros((len(email_integers), length), dtype=int)
for i, row in enumerate(email_integers):
    words[i, -len(row):] = np.array(row)[:length]

In [18]:
# Making labels into binary 0 and 1

labels_int = np.array([1 if each == 'spam' else 0 for each in label])

In [19]:
# Shuffling Our Dataset

s = np.arange(labels_int.shape[0])
np.random.shuffle(s)
words = words[s]
labels_int = labels_int[s]

In [20]:
# Splitting into Train, Test and Validation Sets

split_idx = int(len(words)*0.8)
train_x, val_x = words[:split_idx], words[split_idx:]
train_y, val_y = labels_int[:split_idx], labels_int[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(4137, 3559) 
Validation set: 	(517, 3559) 
Test set: 		(518, 3559)


In [30]:
# Hyperparameters For RNN_LSTM

lstm_size = 256
lstm_layers = 2
batch_size = 10
learning_rate = 0.001
drop_out = 0.5
epochs = 2

In [22]:
n_words = len(vocab_to_int)

# Create the graph object

graph = tf.Graph()

# Add nodes to the graph

with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [23]:
# Converting Words to Word Embeddings

embed_size = 300 

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [24]:
# Making our LSTM cell

with graph.as_default():
    def lstm_cell():
        cell = tf.contrib.rnn.LSTMCell(lstm_size, 
                                       initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2),
                                       state_is_tuple=True)
        drop = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
        return drop
    
    stack_cells = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    
    initial_state = state = stack_cells.zero_state(batch_size, tf.float32)

In [25]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(stack_cells, embed, initial_state=initial_state)

In [26]:
# Loss Function and Optimization Function

with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [27]:
# Evaluation and Accuracy

with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [31]:
# Batching

def get_batches(x, y, batch_size=50):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [32]:
# Training the RNN_LSTM Model

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: drop_out,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(stack_cells.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/spam_ham.ckpt")

Epoch: 0/2 Iteration: 5 Train loss: 0.151
Epoch: 0/2 Iteration: 10 Train loss: 0.244
Epoch: 0/2 Iteration: 15 Train loss: 0.167
Epoch: 0/2 Iteration: 20 Train loss: 0.233
Epoch: 0/2 Iteration: 25 Train loss: 0.208
Val acc: 0.784
Epoch: 0/2 Iteration: 30 Train loss: 0.118
Epoch: 0/2 Iteration: 35 Train loss: 0.167
Epoch: 0/2 Iteration: 40 Train loss: 0.158
Epoch: 0/2 Iteration: 45 Train loss: 0.151
Epoch: 0/2 Iteration: 50 Train loss: 0.125
Val acc: 0.810
Epoch: 0/2 Iteration: 55 Train loss: 0.089
Epoch: 0/2 Iteration: 60 Train loss: 0.136
Epoch: 0/2 Iteration: 65 Train loss: 0.082
Epoch: 0/2 Iteration: 70 Train loss: 0.120
Epoch: 0/2 Iteration: 75 Train loss: 0.215
Val acc: 0.851
Epoch: 0/2 Iteration: 80 Train loss: 0.159
Epoch: 0/2 Iteration: 85 Train loss: 0.048
Epoch: 0/2 Iteration: 90 Train loss: 0.260
Epoch: 0/2 Iteration: 95 Train loss: 0.041
Epoch: 0/2 Iteration: 100 Train loss: 0.020
Val acc: 0.902
Epoch: 0/2 Iteration: 105 Train loss: 0.036
Epoch: 0/2 Iteration: 110 Train loss

In [35]:
# Testing the RNN_LSTM Model

test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(stack_cells.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints_try/spam_ham.ckpt
Test accuracy: 0.959
