In [115]:
import gc
import re
import os
import pandas as pd
import numpy as np
from unidecode import unidecode
from sklearn.preprocessing import StandardScaler
# Show which files are present in the input directory.
print(os.listdir("../input"))

['embeddings', 'train.csv', 'sample_submission.csv', 'test.csv']


In [100]:
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input, Conv1D, GlobalMaxPooling1D, Dropout, concatenate, Layer, InputSpec, CuDNNLSTM, CuDNNGRU, Bidirectional, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras import activations, initializers, regularizers, constraints
from keras.constraints import maxnorm
from keras.callbacks import Callback
from keras import optimizers

In [4]:
def f1_score(true, pred):
    """F1 metric for a sigmoid output, binarised at a 0.5 threshold.

    Parameters
    ----------
    true : tensor
        Ground-truth labels (0/1).
    pred : tensor
        Predicted probabilities; values strictly greater than 0.5 count as
        positive predictions.

    Returns
    -------
    tensor
        Scalar harmonic mean of precision and recall over the given tensors.
        ``K.epsilon()`` is added to every count, so the ratios stay finite
        when a count is zero.
    """
    eps = K.epsilon()
    hard_pred = K.cast(K.greater(pred, 0.5), K.floatx())

    true_positive_count = K.sum(true * hard_pred) + eps
    predicted_positive_count = K.sum(hard_pred) + eps
    actual_positive_count = K.sum(true) + eps

    precision = true_positive_count / predicted_positive_count
    recall = true_positive_count / actual_positive_count

    return (2 * precision * recall) / (precision + recall)

In [169]:
# Load only the columns used below: the question text (both files) and the
# binary target (train only).
train_df = pd.read_csv('../input/train.csv', usecols=['question_text', 'target'])
test_df = pd.read_csv('../input/test.csv', usecols = ['question_text'])

In [170]:
# Matches every character that is NOT an ASCII letter, a space, or one of
# . - ? ! , # @ %  (digits and all other punctuation are therefore matched).
special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]', re.IGNORECASE)


def clean_text(x):
    """Transliterate ``x`` with ``unidecode`` and drop disallowed characters.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        The transliterated text with every character matched by
        ``special_character_removal`` removed.
    """
    return special_character_removal.sub('', unidecode(x))

In [171]:
# Clean the question text of both frames in place; str() guards against
# non-string cells before they reach clean_text.
for frame in (train_df, test_df):
    frame['question_text'] = frame['question_text'].apply(lambda raw: clean_text(str(raw)))

# Convenience handles on the cleaned text and the labels.
train_sentences, train_labels = train_df['question_text'], train_df['target']
test_sentences = test_df['question_text']

In [172]:
def add_features(df):
    """Add two text-statistics columns to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``question_text`` column. The column is coerced to
        ``str`` in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with two new float columns:

        * ``caps_vs_length``  - upper-case characters / (characters + 1)
        * ``words_vs_unique`` - distinct whitespace-separated tokens /
          (whitespace-separated tokens + 1)

    Notes
    -----
    Intermediate counts are kept as local Series instead of scratch columns,
    and the ratios are computed with vectorised division rather than a
    row-wise ``DataFrame.apply``. The resulting values are unchanged, and an
    empty frame is handled.
    """
    # Coerce to str so that non-string cells do not break len() / .split().
    df['question_text'] = df['question_text'].apply(str)
    questions = df['question_text']

    total_length = questions.apply(len)
    capitals = questions.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # The +1 keeps the ratio defined for empty strings.
    df['caps_vs_length'] = capitals / (total_length + 1)

    # Raw string: '\S' is not a valid escape in a normal string literal.
    num_words = questions.str.count(r'\S+')
    num_unique_words = questions.apply(lambda comment: len(set(comment.split())))
    df['words_vs_unique'] = num_unique_words / (num_words + 1)

    return df

In [173]:
# add_features writes the new columns onto the frame it receives and returns
# that same frame, so these assignments rebind the same objects.
train_df = add_features(train_df)
test_df = add_features(test_df)

In [174]:
# Pull the engineered columns out as standalone frames, replacing NaN with 0.
feature_columns = ['caps_vs_length', 'words_vs_unique']
train_features = train_df[feature_columns].fillna(0)
test_features = test_df[feature_columns].fillna(0)

In [175]:
# Preview the first rows of the training frame, including the new columns.
train_df.head()

Unnamed: 0,question_text,target,caps_vs_length,words_vs_unique
0,How did Quebec nationalists see their province...,0,0.028986,0.928571
1,"Do you have an adopted dog, how would you enco...",0,0.012195,0.882353
2,Why does velocity affect time? Does velocity a...,0,0.029412,0.727273
3,How did Otto von Guericke used the Magdeburg h...,0,0.068966,0.9
4,Can I convert montra helicon D to a mountain b...,0,0.038462,0.9375


In [176]:
# Preview the first rows of the extracted feature frame.
train_features.head()

Unnamed: 0,caps_vs_length,words_vs_unique
0,0.028986,0.928571
1,0.012195,0.882353
2,0.029412,0.727273
3,0.068966,0.9
4,0.038462,0.9375


In [177]:
# Force a garbage-collection pass; the cell output is collect()'s return value.
gc.collect()

213

In [145]:
# max_features: passed as the Tokenizer's num_words and as the Embedding
# layer's input dimension.
max_features = 20000
# maxlen: length every sequence is padded/truncated to, and the model's
# token-input length.
maxlen = 100

In [146]:
# Word-level tokenizer limited to max_features words.
tokenizer = text.Tokenizer(num_words=max_features)

In [147]:
# The vocabulary is built from the train AND test questions together.
tokenizer.fit_on_texts(list(train_sentences) + list(test_sentences))

In [148]:
# Convert each training question to a list of word indices, then pad/truncate
# every list to maxlen.
tokenized_train = tokenizer.texts_to_sequences(train_sentences)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

In [149]:
# Same conversion for the test questions, using the same fitted tokenizer.
tokenized_test = tokenizer.texts_to_sequences(test_sentences)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

In [150]:
# Drop the intermediates that are no longer needed once X_train / X_test
# exist. NOTE: the tokenizer itself is deleted too, so new text cannot be
# encoded after this cell without refitting.
del tokenized_test, tokenized_train, tokenizer, train_sentences, test_sentences
gc.collect()

0

In [151]:
# Batch size used for both fit() and predict().
batch_size = 1024
# NOTE(review): the visible fit() call passes epochs=2 literally instead of
# reading this value - confirm which one is intended.
epochs = 4
# Output dimension of the Embedding layer.
embed_size = 300

In [162]:
# Another explicit garbage-collection pass before the model is built.
gc.collect()

91

In [153]:
def cudnnlstm_model(features=None):
    """Build and compile the bidirectional CuDNN LSTM -> GRU classifier.

    Parameters
    ----------
    features : array-like with a ``shape`` attribute, optional
        Hand-crafted per-sample features. Only ``features.shape[1]`` is read,
        to size an auxiliary input that is concatenated with the pooled
        recurrent outputs. When omitted (the default) the model takes the
        padded token sequence as its only input.

    Returns
    -------
    keras.models.Model
        Compiled with binary cross-entropy, Adam (``clipvalue=1.0``) and the
        ``f1_score`` metric. Inputs are ``[tokens]`` when ``features`` is
        None and ``[tokens, features]`` otherwise.

    Notes
    -----
    Reads the module-level ``maxlen``, ``max_features`` and ``embed_size``.
    Every ``Input`` that feeds the graph is declared in ``Model(inputs=...)``;
    leaving the feature input out makes Keras reject the graph as
    disconnected.
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(CuDNNLSTM(64, kernel_initializer='glorot_normal', return_sequences=True))(x)
    # With return_state=True the wrapper yields the sequence output followed
    # by the forward and then the backward final state; only the forward
    # state is used below, as in the original architecture.
    x, fwd_state, _bwd_state = Bidirectional(
        CuDNNGRU(64, kernel_initializer='glorot_normal', return_sequences=True, return_state=True)
    )(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)

    model_inputs = [inp]
    merged = [avg_pool, fwd_state, max_pool]
    if features is not None:
        features_input = Input(shape=(features.shape[1],))
        model_inputs.append(features_input)
        merged.append(features_input)

    x = concatenate(merged)
    x = Dense(32, activation="tanh", kernel_initializer='glorot_normal')(x)
    x = Dense(1, activation="sigmoid", kernel_initializer='glorot_normal')(x)

    model = Model(inputs=model_inputs, outputs=x)
    adam = optimizers.Adam(clipvalue=1.0)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=[f1_score])

    return model

In [154]:
# NOTE: this rebinds `cudnnlstm_model` from the builder function to the built
# model, so this cell cannot be re-run without first re-running the cell that
# defines the function.
cudnnlstm_model = cudnnlstm_model()
cudnnlstm_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_33 (Embedding)        (None, 100, 300)     6000000     input_33[0][0]                   
__________________________________________________________________________________________________
bidirectional_65 (Bidirectional (None, 100, 128)     187392      embedding_33[0][0]               
__________________________________________________________________________________________________
bidirectional_66 (Bidirectional [(None, 100, 128), ( 74496       bidirectional_65[0][0]           
__________________________________________________________________________________________________
global_ave

In [155]:
# File the best weights are written to and later reloaded from.
weight_path="early_weights.hdf5"
# Save weights only when the validation F1 metric improves (higher is better).
checkpoint = ModelCheckpoint(weight_path, monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
# Stop training once validation F1 has not improved for 2 consecutive epochs.
early_stopping = EarlyStopping(monitor="val_f1_score", mode="max", patience=2)
callbacks = [checkpoint, early_stopping]

In [156]:
# Train on the token sequences only, with 20% of the samples held out for
# validation via validation_split.
# NOTE(review): epochs is hard-coded to 2 here rather than using the `epochs`
# variable; with patience=2 early stopping cannot trigger within 2 epochs.
cudnnlstm_model.fit(X_train, train_labels, batch_size=batch_size, epochs=2, shuffle = True, validation_split=0.20, callbacks=callbacks)

Train on 1044897 samples, validate on 261225 samples
Epoch 1/2

Epoch 00001: val_f1_score improved from -inf to 0.60516, saving model to early_weights.hdf5
Epoch 2/2

Epoch 00002: val_f1_score improved from 0.60516 to 0.62674, saving model to early_weights.hdf5


<keras.callbacks.History at 0x7fa73be861d0>

In [157]:
# Reload the weights saved at weight_path before scoring the test set.
cudnnlstm_model.load_weights(weight_path)
test_scores = cudnnlstm_model.predict(X_test, batch_size=batch_size)
# Flatten the per-row model outputs into one flat list of scores.
y_pred = list(test_scores.ravel())

In [158]:
# Load the submission template that the predictions are written into.
sample = pd.read_csv('../input/sample_submission.csv')

In [159]:
# Series assignment aligns on index, so row i of the template receives
# y_pred[i] provided `sample` has the default 0..n-1 index.
sample['prediction'] = pd.Series(y_pred)

In [160]:
# Binarise the scores: values <= 0.5 become 0, everything else becomes 1.
sample['prediction'] = np.where(sample['prediction'].le(0.5), 0, 1)

In [161]:
# Write the submission file without the DataFrame index column.
sample.to_csv('submission.csv', index=False)