In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import gc
from bs4 import BeautifulSoup
import nltk
import pickle

from sklearn.feature_extraction.text import CountVectorizer


np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.regularizers import l2, activity_l2
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.utils.visualize_util import plot
#from nltk.corpus import stopwords # Import the stop word list #nltk package for nlp pre-processing


Using Theano backend.


##### Import raw input data into the Dataframe "df"

In [2]:
print('Current working directory is: ', os.getcwd())
print('List of file in directory:\n', os.listdir(os.getcwd()))

('Current working directory is: ', '/Users/Armin/Documents/Deep_Learning/spam_detection')
('List of file in directory:\n', ['.DS_Store', '.git', '.gitignore', '.ipynb_checkpoints', 'bag_of_words_features.pkl', 'data', 'model.png', 'model_loaded.png', 'my_model_architecture.json', 'my_model_weights.h5', 'prediction.ipynb', 'training.ipynb'])


In [3]:
Input_text = pd.read_csv('./data/smsspamcollection.csv', sep='~~', engine='python', header=None)
df = pd.DataFrame(Input_text)
df.columns = ['Flag','Text']

##### Unblock this blok to import NLTK for further NLP pre-processing

In [4]:
#nltk.download()
#print stopwords.words("english") 

##### Pre-processing input

In [5]:
# Get the number of reviews based on the dataframe column size
input_data_size = Input_text["Text"].size
print ("Input data size is:", input_data_size)

('Input data size is:', 5568)


In [6]:
def review_to_words( raw_text ):
    # Function to convert a raw text to a string of words
    # The input is a single string , and 
    # the output is a single string
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_text, "lxml").get_text() 
    #
    # 2. Remove non-letters        
    #letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = review_text.lower().split()                            
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    #stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    #meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( words ))   

In [7]:
# Initialize an empty list to hold the clean input text
clean_training_set = []
num_inputs = Input_text["Text"].size

In [8]:
# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in xrange( 0, input_data_size ):
    # Call our function for each one, and add the result to the list of
    # clean texts
    # Following lines give the status of text cleaning
    if( (i+1)%1000 == 0 ):
        print "Message %d of %d\n" % ( i+1, num_inputs )  # Monitoring progress     
    clean_training_set.append( review_to_words( Input_text["Text"][i] ) )
print "The of training set is %d\n" % (len(clean_training_set))

Message 1000 of 5568

Message 2000 of 5568

Message 3000 of 5568

Message 4000 of 5568

Message 5000 of 5568

The of training set is 5568



In [9]:
print "Creating the bag of words...\n"

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 10000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_training_set)
# Save vectorizer for later prediction
pickle.dump(vectorizer.vocabulary_,open("bag_of_words_features.pkl","wb"))

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()
print "Done!! Created the bag of words.\n"

Creating the bag of words...

Done!! Created the bag of words.



In [10]:
label_vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 1) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_labels = label_vectorizer.fit_transform(Input_text["Flag"])

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_labels = train_data_labels.toarray()
print "Done!! Created Flags.\n"
lab = ['spam']
lab2 = label_vectorizer.fit_transform(lab)
print train_data_labels

Done!! Created Flags.

[[1]
 [1]
 [0]
 ..., 
 [1]
 [1]
 [1]]


##### Setup the training and test inputs

In [12]:
m = 4000 # m determines the number of the data points to be used for training the model
Y_train = np.asarray(train_data_labels[0:m])
X_train = np.asarray(train_data_features[0:m])
Y_test = np.asarray(train_data_labels[m:])
X_test = np.asarray(train_data_features[m:])
#X_submission = np.asarray(train_data_features[])
# convert class vectors to binary class matrices
#Y_train = np_utils.to_categorical(Y_train, 2)
#Y_test = np_utils.to_categorical(Y_test, 2)

print "The test X is of shape:",X_test.shape
print "The test Y is of shape:",Y_test.shape
#print "The Submission set is of shape:",X_submission.shape

 The test X is of shape: (1568, 8703)
The test Y is of shape: (1568, 1)


##### Setup the model global parameters

In [13]:
# Embedding
max_features = 20000
maxlen = 200
embedding_size = 128

# Convolution
filter_length = 10
nb_filter = 64
pool_length = 2

# LSTM
lstm_output_size = 70

# Training
batch_size = 32
nb_epoch = 2

# 
output_size = nb_filter * (((maxlen - filter_length) / 1) + 1) / 2

##### Padding the input sentences to maxlen

In [14]:
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Pad sequences (samples x time)
('X_train shape:', (4000, 200))
('X_test shape:', (1568, 200))


##### Setup the architecture

In [15]:
model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))
model.add(MaxPooling1D(pool_length=pool_length))
model.add(LSTM(lstm_output_size))
model.add(Dense(1))
model.add(Activation('softmax'))

#####  Setup the optimizer 

In [24]:
model.compile(loss='binary_crossentropy',
              optimizer='adadelta',
              metrics=['accuracy'])

##### Begin Training

In [25]:
print('Train...')
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          validation_data=(X_test, Y_test), shuffle=True)
score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 4000 samples, validate on 1568 samples
Epoch 1/2
Epoch 2/2
('Test score:', 2.1554756018580221)
('Test accuracy:', 0.86479591836734693)


##### Save model architecture to an image file

In [26]:
plot(model, to_file='model.png', show_shapes = True)

##### Save model parameters to retireve for prediction

In [27]:
print model.summary()

json_string = model.to_json()
open('my_model_architecture.json', 'w').write(json_string)
model.save_weights('my_model_weights.h5', overwrite=True)

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
embedding_1 (Embedding)            (None, 200, 128)    2560000     embedding_input_1[0][0]          
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)    (None, 191, 64)     81984       embedding_1[0][0]                
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D)      (None, 95, 64)      0           convolution1d_1[0][0]            
____________________________________________________________________________________________________
lstm_1 (LSTM)                      (None, 70)          37800       maxpooling1d_1[0][0]             
___________________________________________________________________________________________

In [30]:
prediction_results = model.predict_classes(X_train, verbose=0)

In [31]:
print np.sum(prediction_results)

4000
