In [67]:
import gc
import chardet
import re
import os
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64
# Show which files are available in the input directory.
print(os.listdir("../input"))

['embeddings', 'train.csv', 'sample_submission.csv', 'test.csv']


In [2]:
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input, Conv1D, GlobalMaxPool1D, Dropout, concatenate, Layer, InputSpec, CuDNNLSTM
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras import activations, initializers, regularizers, constraints
from keras.utils.conv_utils import conv_output_length
from keras.regularizers import l2
from keras.constraints import maxnorm

Using TensorFlow backend.


In [68]:
def create_download_link(df, title = "Download CSV file", filename = "data.csv", index = True):
    """Return an IPython ``HTML`` anchor that downloads ``df`` as a CSV file.

    The frame is serialised with ``df.to_csv`` and embedded directly in the
    link as a base64 ``data:`` URI, so nothing is written to disk.

    Args:
        df: Object exposing ``to_csv`` (a DataFrame in this notebook).
        title: Text shown for the link. HTML-escaped before insertion.
        filename: File name placed in the anchor's ``download`` attribute.
            HTML-escaped before insertion.
        index: Forwarded to ``df.to_csv``. The default ``True`` keeps the
            previous behaviour of writing the index as the first column;
            pass ``False`` to omit it.

    Returns:
        An ``IPython.display.HTML`` object wrapping the anchor markup.
    """
    # Local import so the cell does not depend on a change to the import cell.
    from html import escape

    csv_text = df.to_csv(index=index)
    payload = base64.b64encode(csv_text.encode()).decode()
    link = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    # Escape the caller-supplied pieces: a quote in `filename` or a '<' / '&'
    # in `title` would otherwise break the generated markup. The base64
    # payload only contains URI-safe characters and needs no escaping.
    link = link.format(
        payload=payload,
        title=escape(str(title)),
        filename=escape(str(filename)),
    )
    return HTML(link)

In [7]:
# Load only the columns used below: the question text and, for the training set, the target label.
train_df = pd.read_csv('../input/train.csv', usecols=['question_text', 'target'])
test_df = pd.read_csv('../input/test.csv', usecols = ['question_text'])

In [15]:
# Matches every run of characters other than lowercase a-z and the space.
# Matches are removed (not replaced by a space), so digits and punctuation
# vanish and the text on either side of them is joined together.
pattern = re.compile(r"[^a-z ]+")


def normalize_question_text(questions):
    """Lowercase a Series of question strings and strip non-letter characters.

    Args:
        questions: pandas Series of question texts. Missing values are
            treated as empty strings instead of raising ``AttributeError``
            as a per-element ``x.lower()`` would.

    Returns:
        A new Series with the same index containing only the characters
        ``a-z`` and the space.
    """
    # `regex=True` is required when passing a compiled pattern.
    return questions.fillna("").str.lower().str.replace(pattern, "", regex=True)


# Apply the identical normalisation to both splits.
train_df['question_text'] = normalize_question_text(train_df['question_text'])
test_df['question_text'] = normalize_question_text(test_df['question_text'])

In [21]:
# Pull the training texts and labels, then the test texts, out of the frames as plain lists.
train_sentences, train_labels = (
    list(train_df[column]) for column in ('question_text', 'target')
)
test_sentences = list(test_df['question_text'])

In [22]:
# Force a garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

87

In [23]:
# Vocabulary cap: passed to the Tokenizer as num_words and to the Embedding layer as its input size.
max_features = 20000
# Sequence length: passed to pad_sequences as maxlen and used as the model's Input shape.
maxlen = 100

In [24]:
# Keras text tokenizer limited to max_features words.
tokenizer = text.Tokenizer(num_words=max_features)

In [25]:
# Fit the tokenizer on the training questions only; the test questions are not used here.
tokenizer.fit_on_texts(train_sentences)

In [26]:
# Convert each training question to a sequence of token ids, then pad to a fixed length of maxlen.
tokenized_train = tokenizer.texts_to_sequences(train_sentences)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

In [27]:
# Inspect the first encoded question (the output below shows zeros on the left, token ids on the right).
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    9,
         48, 6524, 7212,  155,   55, 6062,   36,    4, 1181,    6,    1,
        295], dtype=int32)

In [28]:
# Encode the test questions with the same tokenizer and the same maxlen as the training data.
tokenized_test = tokenizer.texts_to_sequences(test_sentences)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

In [29]:
# Drop the intermediates now that X_train / X_test exist, then collect garbage.
# Note: the tokenizer is deleted too, so the encoding cells above cannot be re-run without refitting it.
del tokenized_test, tokenized_train, tokenizer, train_sentences, test_sentences
gc.collect()

0

In [36]:
# Training hyper-parameters, used by the fit and predict calls below.
batch_size = 512
epochs = 1

In [37]:
# Another garbage-collection pass before the model is built.
gc.collect()

0

In [38]:
def build_cudnnlstm_model(conv_layers = 2, max_dilation_rate = 3):
    """Build and compile the Conv1D + CuDNNLSTM binary classifier.

    Layer stack: embedding -> dropout -> two width-3 convolutions -> three
    regularised width-3 convolutions (strides 1, 1, 2) -> a forward and a
    backward CuDNNLSTM over the same features, concatenated -> dropout ->
    64-unit ReLU layer -> dropout -> single sigmoid output.

    Reads the module-level ``maxlen`` (input length) and ``max_features``
    (embedding vocabulary size).

    Args:
        conv_layers: Currently unused; kept so existing calls stay valid.
        max_dilation_rate: Currently unused; kept so existing calls stay valid.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer and the ``binary_accuracy`` metric.
    """
    embed_size = 128

    def penalties():
        # Fresh regulariser/constraint objects for each layer, as before.
        return dict(
            kernel_regularizer=l2(4e-6),
            bias_regularizer=l2(4e-6),
            kernel_constraint=maxnorm(10),
            bias_constraint=maxnorm(10),
        )

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Dropout(0.25)(x)
    x = Conv1D(2*embed_size, kernel_size = 3)(x)
    x = Conv1D(2*embed_size, kernel_size = 3)(x)
    # Filter count follows the stride: 256, 256, then 512 for the stride-2 layer.
    for strides in [1, 1, 2]:
        x = Conv1D(128*2**(strides), strides = strides, kernel_size=3, **penalties())(x)
    x_f = CuDNNLSTM(512, **penalties())(x)
    # The backward branch must read the sequence in reverse; without
    # go_backwards it was an exact duplicate of the forward configuration.
    x_b = CuDNNLSTM(512, go_backwards=True, **penalties())(x)
    x = concatenate([x_f, x_b])
    x = Dropout(0.5)(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_accuracy'])

    return model

# The builder has its own name so that binding the model to `cudnnlstm_model`
# does not overwrite the function; this cell can therefore be re-run.
cudnnlstm_model = build_cudnnlstm_model()
cudnnlstm_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 100, 128)     2560000     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_4 (Dropout)             (None, 100, 128)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 98, 256)      98560       dropout_4[0][0]                  
__________________________________________________________________________________________________
conv1d_7 (

In [39]:
# Checkpoint file; the same path is loaded again before predicting.
weight_path="early_weights.hdf5"
# Save weights only when the validation loss improves.
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Stop once the validation loss has not improved for 5 epochs.
# NOTE(review): with epochs = 1 as set in this notebook, this patience can never be reached.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)
callbacks = [checkpoint, early_stopping]

In [40]:
# Train on the padded sequences, holding out 20% of the rows for validation.
cudnnlstm_model.fit(
    X_train,
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    validation_split=0.20,
    callbacks=callbacks,
)

Train on 1044897 samples, validate on 261225 samples
Epoch 1/1

Epoch 00001: val_loss improved from inf to 0.21314, saving model to early_weights.hdf5


<keras.callbacks.History at 0x7f0ad3840da0>

In [41]:
# Restore the weights stored at weight_path before scoring the test set.
cudnnlstm_model.load_weights(weight_path)
# Flatten the 2-D prediction array into a flat list of per-question scores.
y_pred = list(cudnnlstm_model.predict(X_test, batch_size=batch_size).ravel())

In [43]:
# Load the sample submission file; its 'prediction' column is overwritten below.
sample = pd.read_csv('../input/sample_submission.csv')

In [55]:
# Assigning a Series aligns on index labels, not position.
# NOTE(review): assumes sample has a default 0..n-1 index and the same row order and count as test.csv — confirm.
sample['prediction'] = pd.Series(y_pred)

In [60]:
# Binarise the scores at a fixed 0.2 cutoff: below the cutoff -> 0, everything else -> 1.
below_cutoff = sample['prediction'] < 0.2
sample['prediction'] = (~below_cutoff).astype('int64')

In [64]:
# Write the submission file without the index column.
sample.to_csv('out.csv', index=False)

In [None]:
# Render a download link for the submission frame.
# NOTE(review): called with defaults, the linked CSV includes the index column, unlike out.csv above which is written with index=False.
create_download_link(sample)