# Vanilla LSTMs, 2017-05-05

Best validation loss: 0.49.

In [2]:
from datetime import datetime
from IPython.display import SVG

import pandas as pd
import numpy as np

from keras.models import Model
from keras.layers import Dense, Dropout, Input, LSTM, Embedding
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, ProgbarLogger, TensorBoard
from keras_tqdm import TQDMNotebookCallback

from utils import load_embeddings, extract_questions_from_dataframe, save_submission

%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


In [3]:
class Config(object):
    """Constants shared by preprocessing and the model, plus a run-name helper."""

    # Embedding table geometry.
    VOCABULARY_SIZE = 1193514
    EMBEDDING_DIMENSION = 200

    # Special token ids.
    OOV_TOKEN = 0  # out of vocabulary
    EOS_TOKEN = 1  # end of sentence
    PAD_TOKEN = 2  # padding to max sentence length
    # Added to VOCABULARY_SIZE when sizing the embedding layer
    # (presumably the number of special token ids above -- confirm in utils).
    OFFSET = 3

    MAX_SENTENCE_LENGTH = 60

    def stamp(self, comment):
        """Return ``'<YYYYmmdd_HHMM>_<comment>'`` using the current local time.

        ``comment`` is rendered with ``str.format``, so non-string values
        are accepted.
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M')
        return '{}_{}'.format(timestamp, comment)

In [4]:
%%time
# One Config instance is shared by every helper call below.
current_config = Config()
train_dataframe = pd.read_csv('train.csv')

# GloVe vectors: the helper hands back the embedding matrix (later fed to the
# Embedding layer) together with the word -> index lookup used for encoding.
glove_path = 'glove.twitter.27B.200d.txt'
embedding_weights, word2idx = load_embeddings(glove_path, config=current_config)

# Encode both question columns; prediction_mode=False so labels are returned too.
questions_A, questions_B, labels = extract_questions_from_dataframe(
    train_dataframe,
    word2idx=word2idx,
    config=current_config,
    prediction_mode=False,
)

No saved file, preprocessing from scratch


96463 questions preprocessed
CPU times: user 2min 33s, sys: 6.16 s, total: 2min 39s
Wall time: 2min 46s


In [13]:
# Sanity check: expect (number of question pairs, Config.MAX_SENTENCE_LENGTH).
questions_A.shape

(96463, 60)

In [5]:
# Sample the LSTM input-dropout rate uniformly from [0.15, 0.40). np.random is
# not seeded here, so the value differs on every run; keep it in a variable and
# print it so the hyper-parameter behind a given result is recorded.
lstm_dropout_rate = 0.15 + np.random.rand() * 0.25
print('LSTM dropout rate: {:.4f}'.format(lstm_dropout_rate))

# A single LSTM and a single embedding layer are shared by both questions
# (siamese set-up), so the two inputs are encoded with the same weights.
shared_lstm_layer = LSTM(
    units=100,
    return_sequences=False,  # only the final state is used as the sentence vector
    go_backwards=True,       # the sequence is read in reverse order
    dropout=lstm_dropout_rate
)
shared_embedding_layer = Embedding(
    # Vocabulary rows plus the extra ids accounted for by Config.OFFSET.
    input_dim=current_config.VOCABULARY_SIZE + current_config.OFFSET,
    output_dim=current_config.EMBEDDING_DIMENSION,
    input_length=current_config.MAX_SENTENCE_LENGTH,
    weights=[embedding_weights],  # initialised from the loaded embedding matrix
    trainable=False               # embeddings stay frozen during training
)


def _encode_question(question_input):
    """Run one question input through the shared embedding + LSTM stack.

    Returns a tuple ``(embeddings, sentence_representation, dropped_out)``
    of the intermediate tensors, in the order they are produced.
    """
    embedded = shared_embedding_layer(question_input)
    encoded = shared_lstm_layer(embedded)
    return embedded, encoded, Dropout(0.5)(encoded)


input_A = Input(shape=(current_config.MAX_SENTENCE_LENGTH,))
embeddings_A, sentence_representation_A, dropout_A = _encode_question(input_A)

input_B = Input(shape=(current_config.MAX_SENTENCE_LENGTH,))
embeddings_B, sentence_representation_B, dropout_B = _encode_question(input_B)

# Concatenate both sentence vectors and classify with one sigmoid unit.
merged_model = concatenate([dropout_A, dropout_B])
predictions = Dense(1, activation='sigmoid')(merged_model)

model = Model(inputs=[input_A, input_B], outputs=predictions)
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 60)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 60)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 60, 200)       238703400   input_1[0][0]                    
                                                                   input_2[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 100)           120400      embedding_1[0][0]       

In [9]:
# Callbacks handed to model.fit().

# The weights file name embeds the time at which this cell is executed.
checkpoint_path = '{}.h5'.format(current_config.stamp(comment='1'))

# Stop once val_loss has not improved for 3 consecutive epochs.
early_stopping = EarlyStopping(patience=3, monitor='val_loss')

# Keep only the weights of the best epoch (lowest val_loss).
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
)

# TensorBoard summaries, with histograms computed every epoch.
tensorboard = TensorBoard(histogram_freq=1, log_dir='./logs')

# Progress reporting: Keras' text bar and the tqdm notebook widget.
progress_bar = ProgbarLogger(count_mode='samples')
tqdm_bar = TQDMNotebookCallback()

In [10]:
%%time

# Train the model, holding out 10% of the pairs for validation.
#
# `progress_bar` is deliberately not passed: with verbose=1 Keras installs its
# own ProgbarLogger, and adding a second one printed every progress line (and
# the "Epoch 1/1" header) twice.
#
# NOTE: with epochs=1 the EarlyStopping callback (patience=3) can never
# trigger; raise `epochs` for a full training run.
training_logs = model.fit(
    x=[questions_A, questions_B],
    y=labels,
    epochs=1,
    batch_size=1024,
    validation_split=0.1,
    verbose=1,
    callbacks=[early_stopping, model_checkpoint, tensorboard, tqdm_bar]
)

Train on 86816 samples, validate on 9647 samples


INFO:tensorflow:Summary name embedding_1/embeddings:0 is illegal; using embedding_1/embeddings_0 instead.


INFO:tensorflow:Summary name lstm_1/kernel:0 is illegal; using lstm_1/kernel_0 instead.


INFO:tensorflow:Summary name lstm_1/recurrent_kernel:0 is illegal; using lstm_1/recurrent_kernel_0 instead.


INFO:tensorflow:Summary name lstm_1/bias:0 is illegal; using lstm_1/bias_0 instead.


INFO:tensorflow:Summary name dense_1/kernel:0 is illegal; using dense_1/kernel_0 instead.


INFO:tensorflow:Summary name dense_1/bias:0 is illegal; using dense_1/bias_0 instead.


Widget Javascript not detected.  It may not be installed or enabled properly.


Widget Javascript not detected.  It may not be installed or enabled properly.


Epoch 1/1


Widget Javascript not detected.  It may not be installed or enabled properly.


Widget Javascript not detected.  It may not be installed or enabled properly.


Epoch 1/1


 1024/86816 [..............................] - ETA: 644s - loss: 0.6931 - acc: 0.6396

 1024/86816 [..............................] - ETA: 640s - loss: 0.6931 - acc: 0.6396

 2048/86816 [..............................] - ETA: 585s - loss: 0.6891 - acc: 0.6689

 2048/86816 [..............................] - ETA: 583s - loss: 0.6891 - acc: 0.6689

 3072/86816 [>.............................] - ETA: 545s - loss: 0.6842 - acc: 0.6813

 3072/86816 [>.............................] - ETA: 544s - loss: 0.6842 - acc: 0.6813

 4096/86816 [>.............................] - ETA: 533s - loss: 0.6736 - acc: 0.6973

 4096/86816 [>.............................] - ETA: 532s - loss: 0.6736 - acc: 0.6973

 5120/86816 [>.............................] - ETA: 525s - loss: 0.6553 - acc: 0.7045

 5120/86816 [>.............................] - ETA: 524s - loss: 0.6553 - acc: 0.7045

 6144/86816 [=>............................] - ETA: 513s - loss: 0.6530 - acc: 0.7085

 6144/86816 [=>............................] - ETA: 512s - loss: 0.6530 - acc: 0.7085

 7168/86816 [=>............................] - ETA: 503s - loss: 0.6485 - acc: 0.7112

 7168/86816 [=>............................] - ETA: 503s - loss: 0.6485 - acc: 0.7112

 8192/86816 [=>............................] - ETA: 502s - loss: 0.6443 - acc: 0.7098

 8192/86816 [=>............................] - ETA: 502s - loss: 0.6443 - acc: 0.7098

 9216/86816 [==>...........................] - ETA: 500s - loss: 0.6396 - acc: 0.7087

 9216/86816 [==>...........................] - ETA: 500s - loss: 0.6396 - acc: 0.7087

10240/86816 [==>...........................] - ETA: 483s - loss: 0.6366 - acc: 0.7071

10240/86816 [==>...........................] - ETA: 483s - loss: 0.6366 - acc: 0.7071

11264/86816 [==>...........................] - ETA: 469s - loss: 0.6328 - acc: 0.7069

11264/86816 [==>...........................] - ETA: 469s - loss: 0.6328 - acc: 0.7069

12288/86816 [===>..........................] - ETA: 466s - loss: 0.6275 - acc: 0.7087

12288/86816 [===>..........................] - ETA: 465s - loss: 0.6275 - acc: 0.7087

13312/86816 [===>..........................] - ETA: 457s - loss: 0.6263 - acc: 0.7102

13312/86816 [===>..........................] - ETA: 457s - loss: 0.6263 - acc: 0.7102

14336/86816 [===>..........................] - ETA: 452s - loss: 0.6261 - acc: 0.7109

14336/86816 [===>..........................] - ETA: 452s - loss: 0.6261 - acc: 0.7109

15360/86816 [====>.........................] - ETA: 457s - loss: 0.6244 - acc: 0.7130

15360/86816 [====>.........................] - ETA: 457s - loss: 0.6244 - acc: 0.7130

16384/86816 [====>.........................] - ETA: 454s - loss: 0.6221 - acc: 0.7139

16384/86816 [====>.........................] - ETA: 454s - loss: 0.6221 - acc: 0.7139

17408/86816 [=====>........................] - ETA: 458s - loss: 0.6198 - acc: 0.7142

17408/86816 [=====>........................] - ETA: 458s - loss: 0.6198 - acc: 0.7142

18432/86816 [=====>........................] - ETA: 452s - loss: 0.6182 - acc: 0.7139

18432/86816 [=====>........................] - ETA: 452s - loss: 0.6182 - acc: 0.7139

19456/86816 [=====>........................] - ETA: 447s - loss: 0.6176 - acc: 0.7126

19456/86816 [=====>........................] - ETA: 446s - loss: 0.6176 - acc: 0.7126





































































































































































































































































          86016/|/[loss: 0.572, acc: 0.728]  99%|| 86016/86816 [09:30<00:04, 170.24it/s]










CPU times: user 24min 26s, sys: 5min 42s, total: 30min 9s
Wall time: 10min 28s


In [12]:
# Restore the best checkpoint written during training. The path is read back
# from the ModelCheckpoint callback rather than hard-coded: Config.stamp()
# embeds the wall-clock time, so the file name changes on every run and a
# literal name would load stale weights (or fail) after a re-run.
model.load_weights(model_checkpoint.filepath)

# Lowest validation loss seen across the recorded epochs.
min(training_logs.history['val_loss'])

0.49261399169060632

In [16]:
# Encode the test set with the same config and vocabulary as the training data.
# prediction_mode=True: the third return value is not needed and is discarded.
test_dataframe = pd.read_csv('test.csv')
test_questions_A, test_questions_B, _ = extract_questions_from_dataframe(
    test_dataframe,
    prediction_mode=True,
    word2idx=word2idx,
    config=current_config,
)

No saved file, preprocessing from scratch


2345796 questions preprocessed


In [17]:
# Score every test pair with the trained model.
# NOTE: this rebinds `predictions`, which the model-building cell used for the
# output tensor; from here on it holds the predicted values.
test_inputs = [test_questions_A, test_questions_B]
predictions = model.predict(test_inputs, batch_size=8192, verbose=1)

   8192/2345796 [..............................] - ETA: 8749s

  16384/2345796 [..............................] - ETA: 7783s

  24576/2345796 [..............................] - ETA: 7248s

  32768/2345796 [..............................] - ETA: 6937s

  40960/2345796 [..............................] - ETA: 6804s

  49152/2345796 [..............................] - ETA: 6840s

  57344/2345796 [..............................] - ETA: 7065s

  65536/2345796 [..............................] - ETA: 7439s

  73728/2345796 [..............................] - ETA: 7563s

  81920/2345796 [>.............................] - ETA: 7831s

  90112/2345796 [>.............................] - ETA: 7776s

  98304/2345796 [>.............................] - ETA: 7658s

 106496/2345796 [>.............................] - ETA: 7600s

 114688/2345796 [>.............................] - ETA: 7554s

 122880/2345796 [>.............................] - ETA: 7423s

 131072/2345796 [>.............................] - ETA: 7315s

 139264/2345796 [>.............................] - ETA: 7204s

 147456/2345796 [>.............................] - ETA: 7104s

 155648/2345796 [>.............................] - ETA: 7068s

 163840/2345796 [=>............................] - ETA: 7074s

 172032/2345796 [=>............................] - ETA: 6990s

 180224/2345796 [=>............................] - ETA: 6884s

 188416/2345796 [=>............................] - ETA: 6795s

 196608/2345796 [=>............................] - ETA: 6709s

 204800/2345796 [=>............................] - ETA: 6623s

 212992/2345796 [=>............................] - ETA: 6547s

 221184/2345796 [=>............................] - ETA: 6465s

 229376/2345796 [=>............................] - ETA: 6392s

 237568/2345796 [==>...........................] - ETA: 6317s

 245760/2345796 [==>...........................] - ETA: 6295s

 253952/2345796 [==>...........................] - ETA: 6286s

 262144/2345796 [==>...........................] - ETA: 6310s

 270336/2345796 [==>...........................] - ETA: 6294s

 278528/2345796 [==>...........................] - ETA: 6279s

 286720/2345796 [==>...........................] - ETA: 6242s

 294912/2345796 [==>...........................] - ETA: 6235s

 303104/2345796 [==>...........................] - ETA: 6217s

 311296/2345796 [==>...........................] - ETA: 6181s

 319488/2345796 [===>..........................] - ETA: 6171s

 327680/2345796 [===>..........................] - ETA: 6154s

 335872/2345796 [===>..........................] - ETA: 6109s

 344064/2345796 [===>..........................] - ETA: 6083s

 352256/2345796 [===>..........................] - ETA: 6073s

 360448/2345796 [===>..........................] - ETA: 6056s

 368640/2345796 [===>..........................] - ETA: 6028s

 376832/2345796 [===>..........................] - ETA: 5985s

 385024/2345796 [===>..........................] - ETA: 5940s

 393216/2345796 [====>.........................] - ETA: 5934s

 401408/2345796 [====>.........................] - ETA: 5929s

 409600/2345796 [====>.........................] - ETA: 5908s

 417792/2345796 [====>.........................] - ETA: 5881s

 425984/2345796 [====>.........................] - ETA: 5846s

 434176/2345796 [====>.........................] - ETA: 5802s

 442368/2345796 [====>.........................] - ETA: 5775s

 450560/2345796 [====>.........................] - ETA: 5732s

 458752/2345796 [====>.........................] - ETA: 5690s

 466944/2345796 [====>.........................] - ETA: 5647s

 475136/2345796 [=====>........................] - ETA: 5604s

 483328/2345796 [=====>........................] - ETA: 5555s

 491520/2345796 [=====>........................] - ETA: 5549s

 499712/2345796 [=====>........................] - ETA: 5548s

 507904/2345796 [=====>........................] - ETA: 5530s

 516096/2345796 [=====>........................] - ETA: 5514s

 524288/2345796 [=====>........................] - ETA: 5497s

 532480/2345796 [=====>........................] - ETA: 5472s

 540672/2345796 [=====>........................] - ETA: 5445s






























































































































































































































































































































































































































































In [18]:
# Write the test-set predictions to a submission file via the project helper.
# NOTE(review): the preview printed below shows an "Unnamed: 0" header column
# and hard 0/1 labels -- confirm that save_submission emits the id column name
# and value format the submission is expected to have.
save_submission(predictions, current_config)

Unnamed: 0,is_duplicate
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,1
8,0
9,0


In [19]:
# Line count of the submission file: expect one header line plus one row per
# test pair (2345796 + 1).
# NOTE(review): the file name is hard-coded and presumably comes from
# Config.stamp() inside save_submission -- it will differ on a re-run.
!wc -l 20170505_0156_1.csv

 2345797 20170505_0156_1.csv
