In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import gc
from bs4 import BeautifulSoup
import nltk
import pickle

from sklearn.feature_extraction.text import CountVectorizer


np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.regularizers import l2, activity_l2
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.utils.visualize_util import plot
#from nltk.corpus import stopwords # Import the stop word list #nltk package for nlp pre-processing


Using Theano backend.


##### Import raw input data into the Dataframe "df"

In [2]:
# Show where the notebook is running and which files sit next to it, so the
# relative paths used in later cells can be sanity-checked.
# Each message is formatted into ONE string before printing: passing several
# comma-separated values to print(...) makes Python 2 print a tuple.
cwd = os.getcwd()
print('Current working directory is: %s' % cwd)
print('List of file in directory:\n%s' % os.listdir(cwd))

('Current working directory is: ', '/Users/Armin/Documents/Deep_Learning/spam_detection')
('List of file in directory:\n', ['.DS_Store', '.git', '.gitignore', '.ipynb_checkpoints', 'bag_of_words_features.pkl', 'data', 'model.png', 'model_loaded.png', 'my_model_architecture.json', 'my_model_weights.h5', 'prediction.ipynb', 'training.ipynb'])


In [3]:
# Load the raw corpus. The file has no header row and its fields are separated
# by the two-character token "~~", which requires the python parsing engine.
#
# The column names are assigned at read time so that ``Input_text`` itself
# carries the "Flag"/"Text" labels: later cells index ``Input_text["Text"]``
# and ``Input_text["Flag"]`` directly. Naming the columns only on a second
# DataFrame wrapped around ``Input_text`` depends on both objects sharing
# internal state, which pandas does not guarantee.
Input_text = pd.read_csv('./data/smsspamcollection.csv', sep='~~', engine='python',
                         header=None, names=['Flag', 'Text'])
# ``df`` is kept as a DataFrame view of the same data, with the same columns.
df = pd.DataFrame(Input_text)

##### Uncomment this block to set up NLTK for further NLP pre-processing

In [4]:
#nltk.download()
#print stopwords.words("english") 

##### Pre-processing input

In [5]:
# Number of messages in the corpus, taken from the length of the "Text" column.
input_data_size = Input_text["Text"].size
# Format into a single string first; print("label", value) would print a
# tuple under Python 2.
print("Input data size is: %d" % input_data_size)

('Input data size is:', 5568)


In [6]:
def review_to_words(raw_text):
    """Normalise one raw message into a lower-case, single-spaced string.

    The text is parsed with BeautifulSoup (lxml parser) and reduced to its
    plain-text content, lower-cased, split on whitespace and re-joined with
    single spaces. No punctuation stripping and no stop-word removal is
    performed.

    Parameters
    ----------
    raw_text : str
        The message to clean.

    Returns
    -------
    str
        The cleaned message.
    """
    soup = BeautifulSoup(raw_text, "lxml")
    # split() with no argument collapses any run of whitespace, so the
    # re-joined string has exactly one space between tokens.
    tokens = soup.get_text().lower().split()
    return " ".join(tokens)

In [7]:
# Total number of messages (used when reporting progress) and the container
# that will receive the cleaned text of each message.
num_inputs = Input_text["Text"].size
clean_training_set = list()

In [8]:
# Clean every message with review_to_words() and collect the results.
# The list is (re)created here so that running this cell a second time does
# not append another copy of the corpus to an already-filled list.
clean_training_set = []
# Iterate over the column values directly (positional order) instead of
# looking each row up by integer label.
for i, raw_message in enumerate(Input_text["Text"]):
    # Report progress every 1000 messages.
    if (i + 1) % 1000 == 0:
        print("Message %d of %d\n" % (i + 1, num_inputs))
    clean_training_set.append(review_to_words(raw_message))
print("The size of training set is %d\n" % len(clean_training_set))

Message 1000 of 5568

Message 2000 of 5568

Message 3000 of 5568

Message 4000 of 5568

Message 5000 of 5568

The of training set is 5568



In [9]:
print("Creating the bag of words...\n")

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool. The vocabulary is capped at 10000 terms.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=10000)

# fit_transform() does two things: first, it fits the model and learns the
# vocabulary; second, it transforms the training data into feature vectors.
# The input to fit_transform should be a list of strings.
train_data_features = vectorizer.fit_transform(clean_training_set)

# Save the learned vocabulary (not the whole vectorizer object) for later
# prediction. The "with" block guarantees the file is flushed and closed even
# if pickling fails.
with open("bag_of_words_features.pkl", "wb") as vocabulary_file:
    pickle.dump(vectorizer.vocabulary_, vocabulary_file)

# Numpy arrays are easy to work with, so convert the sparse result to a
# dense array.
train_data_features = train_data_features.toarray()
print("Done!! Created the bag of words.\n")

Creating the bag of words...

Done!! Created the bag of words.



In [10]:
# Encode the "Flag" column with a second CountVectorizer limited to two
# vocabulary entries, giving one count column per flag token.
label_vectorizer = CountVectorizer(analyzer="word",
                                   tokenizer=None,
                                   preprocessor=None,
                                   stop_words=None,
                                   max_features=2)

# fit_transform() learns the flag vocabulary and converts every flag into its
# count vector in one step.
train_data_labels = label_vectorizer.fit_transform(Input_text["Flag"])

# Numpy arrays are easy to work with, so convert the result to an array.
train_data_labels = train_data_labels.toarray()
print("Done!! Created Flags.\n")

# Encode a single 'spam' flag to see which label column stands for spam.
# transform() is used (not fit_transform()) so the vocabulary learned from the
# full "Flag" column is kept; refitting on ['spam'] alone would replace it
# with a one-word vocabulary.
lab = ['spam']
lab2 = label_vectorizer.transform(lab)

Done!! Created Flags.



##### Setup the training and test inputs

In [11]:
# Split the feature and label matrices by row position:
#   rows [0, m)           -> training set
#   rows [m, m + 1000)    -> test set
#   rows [m + 1000, end)  -> "submission" set (features only)
m = 4000  # number of data points used for training the model
test_end = m + 1000

X_train = np.asarray(train_data_features[:m])
Y_train = np.asarray(train_data_labels[:m])
X_test = np.asarray(train_data_features[m:test_end])
Y_test = np.asarray(train_data_labels[m:test_end])
X_submission = np.asarray(train_data_features[test_end:])

# Report the resulting shapes and the feature dtype.
print("The test X is of shape: %s" % (X_test.shape,))
print("The test Y is of shape: %s" % (Y_test.shape,))
print("The Submission set is of shape: %s" % (X_submission.shape,))
print(X_submission.dtype)

The test X is of shape: (1000, 8703)
The test Y is of shape: (1000, 2)
The Submission set is of shape: (568, 8703)
int64


##### Setup the model global parameters

In [12]:
# NOTE(review): 8703 is hard-coded; it matches the column count printed for
# the feature matrix in the previous cell and has to be updated by hand if the
# corpus (and therefore the vocabulary) changes.
max_features = 8703
maxlen = 100  # cut texts after this number of words (among top max_features most common words)
batch_size = 100
num_filters = 32
# side length of maxpooling square
num_pool = 2
# side length of convolution square
num_conv = 3
filter_length = 3
# size of the embedding layer
embedding_dims = 100
# number of output neurons for the first Dense layer
hidden_dims1 = 250
# number of output neurons for the second Dense layer
hidden_dims2 = 100
# this is the length to which each sentence is padded
paddedlength = 100
# Flattened size after a "valid" convolution (stride 1) followed by a
# pooling of length 2. Floor division (//) keeps the result an integer on
# every Python version; with "/" it silently becomes a float outside Python 2.
output_size = num_filters * ((paddedlength - filter_length) // 1 + 1) // 2

##### Design model architecture

In [19]:
print('Build model...')

# Width of one input row, read from the training matrix instead of repeating
# the literal 8703 in several places.
input_width = X_train.shape[1]

# NOTE(review): X_train comes from CountVectorizer(...).toarray(), i.e. it is
# a matrix of word counts, while an Embedding layer looks its inputs up as
# integer indices. Confirm that feeding counts through an embedding (and a
# recurrent layer over `input_width` time steps) is really intended; the
# `maxlen` / `paddedlength` settings suggest padded index sequences were
# planned instead.
model = Sequential()

# Embedding -> LSTM (full sequence output) -> 1D convolution.
model.add(Embedding(max_features, 256, input_length=input_width,
                    batch_input_shape=(batch_size, input_width)))
# No input_shape here: this is not the first layer, so its input shape is
# taken from the Embedding output.
model.add(LSTM(256, return_sequences=True, init='glorot_uniform'))  # try using a GRU instead, for fun
# The convolution already applies ReLU through `activation`, so no separate
# Activation('relu') layer follows it (ReLU applied twice equals ReLU once).
model.add(Convolution1D(nb_filter=num_filters, filter_length=filter_length,
                        border_mode="valid", activation="relu"))

model.add(Flatten())

# Fully connected hidden layer with dropout for regularisation.
model.add(Dense(hidden_dims1))
model.add(Dropout(0.25))
model.add(Activation('relu'))

# Two-way softmax output. Dropout is deliberately NOT applied here: zeroing
# one of the two class scores right before the softmax corrupts the training
# signal without regularising any learned features.
model.add(Dense(2))
model.add(Activation('softmax'))

#model.add(LSTM(16, return_sequences=True)) 
#model.add(Activation('sigmoid'))
#model.add(LSTM(16,return_sequences=True))  # try using a GRU instead, for fun
#model.add(Activation('linear'))
#model.add(LSTM(16, return_sequences=True)) 
#model.add(Activation('linear'))
#model.add(LSTM(16, return_sequences=True))  
#model.add(Activation('sigmoid'))
#model.add(GRU(16, input_shape=(batch_size,64),dropout_W=0.2, dropout_U=0.2,return_sequences=True))  # try using a GRU instead, for fun
#model.add(Activation('linear'))
#model.add(LSTM(16, input_shape=(batch_size,64),dropout_W=0.2, dropout_U=0.2,return_sequences=False))  # try using a GRU instead, for fun


Build model...


##### Setup the optimizer

In [20]:
# Configure training: plain SGD minimising categorical cross-entropy over the
# two-column labels, with accuracy reported alongside the loss.
model.compile(optimizer='SGD',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

##### Begin Training

In [21]:
print('Train...')
print(X_train.shape)
print(Y_train.shape)
# Train for 10 epochs, reshuffling the training samples each epoch.
# NOTE(review): the same (X_test, Y_test) pair is used for per-epoch
# validation and for the final evaluation below, so the reported test score
# is not measured on data held out from model monitoring.
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=10,
          validation_data=(X_test, Y_test), shuffle=True)
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
# Format before printing: print('label', value) prints a tuple under Python 2.
print('Test score: %s' % score)
print('Test accuracy: %s' % acc)

Train...
(4000, 8703)
(4000, 2)
Train on 4000 samples, validate on 1000 samples
Epoch 1/10
 100/4000 [..............................] - ETA: 7148s - loss: 0.7081 - acc: 0.3900

KeyboardInterrupt: 

##### Save model architecture to an image file

In [None]:
# Render the layer graph, annotated with tensor shapes, to model.png.
plot(model, show_shapes=True, to_file='model.png')

##### Save model parameters to retrieve for prediction

In [None]:
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print would add a stray "None" line.
model.summary()

# Persist the architecture as JSON. The "with" block closes the file
# deterministically so the content is fully written before the cell finishes.
json_string = model.to_json()
with open('my_model_architecture.json', 'w') as architecture_file:
    architecture_file.write(json_string)

# Persist the trained weights, replacing any existing file without prompting.
model.save_weights('my_model_weights.h5', overwrite=True)

In [None]:
# Sanity check: sum of every entry of the training feature matrix.
print(X_train.sum())

In [None]:
# Run the model over the training inputs, showing a progress bar.
prediction_results = model.predict(
    X_train,
    verbose=1,
)

In [None]:
# Sanity check: sum of every entry of the prediction array.
print(prediction_results.sum())