In [1]:
import re
import csv
import nltk
import keras
import string
import gensim
import numpy as np
import pandas as pd

Using TensorFlow backend.


In [2]:
# The function "text_to_wordlist" is adapted from:
# kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text

def text_to_wordlist(
                     question, 
                     remove_stopwords   = False, 
                     stem_words         = False,
                     remove_punctuation = False
                    ):

    question = re.sub( r"[^A-Za-z0-9^,!.\/'+-=]", " "          , question)
    question = re.sub( r"what's"                , "what is "   , question)
    question = re.sub( r"\'s"                   , " "          , question)
    question = re.sub( r"\'ve"                  , " have "     , question)
    question = re.sub( r"can't"                 , "cannot "    , question)
    question = re.sub( r"n't"                   , " not "      , question)
    question = re.sub( r"i'm"                   , "i am "      , question)
    question = re.sub( r"\'re"                  , " are "      , question)
    question = re.sub( r"\'d"                   , " would "    , question)
    question = re.sub( r"\'ll"                  , " will "     , question)
    question = re.sub( r","                     , " "          , question)
    question = re.sub( r"\."                    , " "          , question)
    question = re.sub( r"!"                     , " ! "        , question)
    question = re.sub( r"\/"                    , " "          , question)
    question = re.sub( r"\^"                    , " ^ "        , question)
    question = re.sub( r"\+"                    , " + "        , question)
    question = re.sub( r"\-"                    , " - "        , question)
    question = re.sub( r"\="                    , " = "        , question)
    question = re.sub( r"'"                     , " "          , question)
    question = re.sub( r"(\d+)(k)"              , r"\g<1>000"  , question)
    question = re.sub( r":"                     , " : "        , question)
    question = re.sub( r" e g "                 , " eg "       , question)
    question = re.sub( r" b g "                 , " bg "       , question)
    question = re.sub( r" u s "                 , " american " , question)
    question = re.sub( r"\0s"                   , "0"          , question)
    question = re.sub( r" 9 11 "                , "911"        , question)
    question = re.sub( r"e - mail"              , "email"      , question)
    question = re.sub( r"j k"                   , "jk"         , question)
    question = re.sub( r"\s{2,}"                , " "          , question)

    if remove_stopwords:
        question = question.lower().split()
        question = [w for w in question if not w in nltk.corpus.stopwords.words("english")]
        question = ' '.join(question)

    if stem_words:        
        question      = question.lower().split()
        stemmed_words = [nltk.stem.SnowballStemmer('english').stemmer.stem(word) for word in question]
        question      = ' '.join(stemmed_words)

    if remove_punctuation:
        question = ''.join([character for character in question if character not in string.punctuation])
    
    return(question)

In [3]:
####################################################

print 'Processing training dataset'

training_questions_1 = [] 
training_questions_2 = []
training_labels      = []

with open( '/home/ubuntu/train.csv' ) as f:
    
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    
    for row in reader:
        
        training_questions_1.append( text_to_wordlist ( row[3] ))
        training_questions_2.append( text_to_wordlist ( row[4] ))
        training_labels.append( int(row[5] ))
        
print 'Found %s question pairs in train.csv' % len(training_questions_1)

####################################################

print 'Processing test dataset'

test_questions_1  = []
test_questions_2  = []
test_question_ids = []

with open( '/home/ubuntu/test.csv' ) as f:
    
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    
    for row in reader:
        
        test_questions_1.append( text_to_wordlist ( row[1] ))
        test_questions_2.append( text_to_wordlist ( row[2] ))
        test_question_ids.append( row[0] )
        
print 'Found %s question pairs in test.csv' % len(test_questions_1)

####################################################

Processing training dataset
Found 404290 question pairs in train.csv
Processing test dataset
Found 2345796 question pairs in test.csv


In [4]:
from keras.preprocessing.text import Tokenizer

maximum_number_of_words = 200000

####################################################

print 'Tokenizing Words with KERAS'

tokenizer = Tokenizer ( num_words = maximum_number_of_words )

tokenizer.fit_on_texts ( 
                        training_questions_1 + 
                        training_questions_2 + 
                        test_questions_1     +
                        test_questions_2
                       )

training_sequences_1 = tokenizer.texts_to_sequences( training_questions_1 )
training_sequences_2 = tokenizer.texts_to_sequences( training_questions_2 )
test_sequences_1     = tokenizer.texts_to_sequences( test_questions_1 )
test_sequences_2     = tokenizer.texts_to_sequences( test_questions_2 )

word_index = tokenizer.word_index

print 'Found %s unique word tokens' % len(word_index)

####################################################

Tokenizing Words with KERAS
Found 120539 unique word tokens


In [5]:
from keras.preprocessing.sequence import pad_sequences

maximum_sequence_length = 30

####################################################

print 'Creating Tensors with KERAS'

training_data_1   = pad_sequences (
                                   sequences = training_sequences_1, 
                                   maxlen    = maximum_sequence_length
                                  )

training_data_2   = pad_sequences (
                                   sequences = training_sequences_2, 
                                   maxlen    = maximum_sequence_length 
                                  )

training_labels   = np.array( training_labels )

test_data_1       = pad_sequences (
                                   sequences = test_sequences_1, 
                                   maxlen    = maximum_sequence_length
                                  )

test_data_2       = pad_sequences ( 
                                   sequences = test_sequences_2, 
                                   maxlen    = maximum_sequence_length
                                  )

test_question_ids = np.array(test_question_ids)

print 'Shape of training data tensor:',  training_data_1.shape
print 'Shape of testing data tensor:',   test_data_1.shape
print 'Shape of label tensor:',          training_labels.shape

####################################################

Creating Tensors with KERAS
Shape of training data tensor: (404290, 30)
Shape of testing data tensor: (2345796, 30)
Shape of label tensor: (404290,)


In [6]:
####################################################

print 'Indexing word vectors'

word2vec = gensim.models.KeyedVectors.load_word2vec_format(
                                                           '/home/ubuntu/GoogleNews-vectors-negative300.bin', 
                                                           binary=True
                                                          )

print 'Found %s word vectors of word2vec' % len(word2vec.vocab)

####################################################

Indexing word vectors
Found 3000000 word vectors of word2vec


In [7]:
embedding_dimension = 300

####################################################

print 'Preparing embedding matrix'

number_of_words  = min( maximum_number_of_words, len(word_index) )+1

embedding_matrix = np.zeros( (number_of_words, embedding_dimension) )

for word, i in word_index.items():
    
    if word in word2vec.vocab:
        
        embedding_matrix[i] = word2vec.word_vec(word)

print 'Embedding Matrix Shape:', embedding_matrix.shape

####################################################

Preparing embedding matrix
Embedding Matrix Shape: (120540, 300)


In [8]:
validation_split = 0.1

####################################################

print 'Prepared training and validation data matrices'

permutations      = np.random.permutation( len(training_data_1) )

train_indices     = permutations[:int( len(training_data_1) * (1 - validation_split) )]
validate_indicies = permutations[int(  len(training_data_1) * (1 - validation_split) ):]

train_data_1      = np.vstack(( training_data_1[ train_indices ], training_data_2[ train_indices ]))
train_data_2      = np.vstack(( training_data_2[ train_indices ], training_data_1[ train_indices ]))
train_labels      = np.concatenate(( training_labels[ train_indices ], training_labels[ train_indices ]))

validate_data_1   = np.vstack(( training_data_1[ validate_indicies ], training_data_2[ validate_indicies ]))
validate_data_2   = np.vstack(( training_data_2[ validate_indicies ], training_data_1[ validate_indicies ]))
validate_labels   = np.concatenate(( training_labels[ validate_indicies ], training_labels[ validate_indicies ]))

weight_values     = np.ones( len(validate_labels) )

re_weight         = True # whether to re-weight classes to fit the 17.5% share in test set

if re_weight:

    weight_values *= 0.472001959
    weight_values[validate_labels == 0] = 1.309028344
    class_weight = {0: 1.309028344, 1: 0.472001959}
    
else:
    
    class_weight = None

####################################################

Prepared training and validation data matrices


In [9]:
from keras.layers.merge import concatenate

num_lstm        = np.random.randint(175, 275)
rate_drop_lstm  = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25
num_dense       = np.random.randint(100, 150)

####################################################

print 'Defined the model'

embedding_layer = keras.layers.Embedding (
                                          input_dim             = number_of_words,
                                          output_dim            = embedding_dimension,
                                          weights               = [embedding_matrix],
                                          input_length          = maximum_sequence_length,
                                          trainable             = False
                                         )

lstm_layer      = keras.layers.LSTM      (
                                          units                 = num_lstm, 
                                          dropout               = rate_drop_lstm, 
                                          recurrent_dropout     = rate_drop_lstm
                                         )

sequence_1_input     = keras.layers.Input( shape = ( maximum_sequence_length, ), dtype='int32')
embedded_sequence_1  = embedding_layer( sequence_1_input )
x1                   = lstm_layer( embedded_sequence_1 )

sequence_2_input     = keras.layers.Input( shape = ( maximum_sequence_length, ), dtype='int32')
embedded_sequence_2  = embedding_layer( sequence_2_input )
y1                   = lstm_layer( embedded_sequence_2 )

merged_model         = concatenate( [x1, y1] )
merged_model         = keras.layers.Dropout( rate_drop_dense )( merged_model )
merged_model         = keras.layers.normalization.BatchNormalization()( merged_model )

merged_model         = keras.layers.Dense( num_dense, activation='relu' )( merged_model )
merged_model         = keras.layers.Dropout( rate_drop_dense )( merged_model )
merged_model         = keras.layers.normalization.BatchNormalization()( merged_model )

predictions          = keras.layers.Dense( 1, activation='sigmoid' )( merged_model )

####################################################

Defined the model


In [10]:
####################################################

print 'Execute the model'

model = keras.models.Model (
                            inputs                                    = [sequence_1_input, sequence_2_input], 
                            outputs                                   = predictions
                           )

model.compile (
               loss                                                   = 'binary_crossentropy',
               optimizer                                              = 'nadam',
               metrics                                                = ['acc']
              )

model.summary()

early_stopping = keras.callbacks.EarlyStopping (
                                                monitor               = 'val_loss', 
                                                patience              = 3
                                               )

model_name = 'lstm_%d_%d_%.2f_%.2f' % (
                                       num_lstm, 
                                       num_dense, 
                                       rate_drop_lstm, 
                                       rate_drop_dense
                                      ) + '.h5'

model_checkpoint = keras.callbacks.ModelCheckpoint (
                                                    filepath          = model_name, 
                                                    save_best_only    = True, 
                                                    save_weights_only = True
                                                   )

hist = model.fit (
                  x                                                   = [train_data_1, train_data_2], 
                  y                                                   = train_labels,
                  epochs                                              = 2, 
                  batch_size                                          = 2048, 
                  shuffle                                             = True,
                  class_weight                                        = class_weight, 
                  callbacks                                           = [early_stopping, model_checkpoint],
                  validation_data                                     = (
                                                                         [validate_data_1, validate_data_2], 
                                                                         validate_labels, 
                                                                         weight_values
                                                                        )
                 )

model.load_weights( model_name )

print '\n Best Score:', min(hist.history['val_loss'])

####################################################

Execute the model
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 30)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 30)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 30, 300)       36162000    input_1[0][0]                    
                                                                   input_2[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 180)           346320      embedd

In [12]:
####################################################

print 'Save the predictions to a file for submission'

predictions = model.predict  (
                              [test_data_1, test_data_2], 
                              batch_size = 8192, 
                              verbose    = 1
                             )

predictions += model.predict (
                              [test_data_2, test_data_1], 
                              batch_size = 8192, 
                              verbose    = 1
                             )

predictions /= 2

submission = pd.DataFrame ( {
                             'test_id'      : test_question_ids, 
                             'is_duplicate' : predictions.ravel()
                          } )

submission.to_csv( 'LSTM_submission_to_kaggle.csv', index=False )

print 'Model Saved'

####################################################

Save the predictions to a file for submission
Model Saved


In [13]:
!pwd
!ls -l LSTM_submission_to_kaggle.csv

/home/ubuntu/pynb
-rw-rw-r-- 1 ubuntu ubuntu 53819462 Aug 11 21:33 LSTM_submission_to_kaggle.csv
