## Imports

In [1]:
import datetime
import os

import numpy as np

In [2]:
from sklearn.model_selection import StratifiedKFold

In [3]:
from keras import backend as K
from keras.models import Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


## Config

In [4]:
# Identifier embedded in the checkpoint file name and in the file names of the
# saved train/test feature pickles.
feature_list_id = 'oofp_nn_concat_dense_1'

In [5]:
# Seed shared by NumPy's global RNG and the shuffling of the stratified K-fold split.
RANDOM_SEED = 42

In [6]:
# Seed NumPy's global RNG. No cell visible here seeds the Keras/TensorFlow
# backend, so network weight initialisation is presumably not covered by this
# seed — TODO confirm.
np.random.seed(RANDOM_SEED)

In [7]:
# All folders live under <cwd>/../data. Every value ends with a path separator
# (joining with a trailing '' appends one), so file names can be attached with `+`.
data_folder = os.path.join(
    os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')),
    '',
)
aux_data_folder = os.path.join(data_folder, 'aux', '')
preproc_data_folder = os.path.join(data_folder, 'preproc', '')
features_data_folder = os.path.join(data_folder, 'features', '')
submissions_data_folder = os.path.join(data_folder, 'submissions', '')

## Read Data

In [8]:
# Embedding weights; used below as the frozen initial weights of both Embedding
# layers in create_model().
# NOTE(review): `load` is not defined or imported in the visible cells —
# presumably a pickle-reading helper from a project utility import; TODO confirm.
embedding_matrix = load(aux_data_folder + 'embedding_weights_fasttext_filtered_no_stopwords.pickle')

In [9]:
# Per-question training inputs. They are fed to Embedding layers further down,
# so they are presumably padded token-index sequences — TODO confirm.
X_train_q1 = load(features_data_folder + 'X_train_nn_fasttext_q1_filtered_no_stopwords.pickle')
X_train_q2 = load(features_data_folder + 'X_train_nn_fasttext_q2_filtered_no_stopwords.pickle')

In [10]:
# Stack the two question arrays along a new axis 1, so X_train[:, 0] is q1 and
# X_train[:, 1] is q2. This lets one index array select matching rows of both
# questions when splitting into folds.
X_train = np.stack((X_train_q1, X_train_q2), axis=1)

In [11]:
# Training labels: used for stratifying the folds and as the target of the
# sigmoid / binary cross-entropy model.
y_train = load(features_data_folder + 'y_train.pickle')

In [12]:
# Size of the last axis of the embedding matrix (length of one word vector).
EMBEDDING_DIM = embedding_matrix.shape[-1]
# Row count minus one; the Embedding layers are built with VOCAB_LENGTH + 1
# inputs, i.e. the full row count. The extra row is presumably the padding
# index — TODO confirm.
VOCAB_LENGTH = embedding_matrix.shape[0] - 1
# Size of the last axis of the stacked inputs (length of one question sequence).
MAX_SEQUENCE_LENGTH = X_train.shape[-1]

In [13]:
# Sanity check of the derived dimensions.
print(EMBEDDING_DIM, VOCAB_LENGTH, MAX_SEQUENCE_LENGTH)

300 101441 30


## Train Models & Compute Out-of-Fold Predictions

In [14]:
def _create_question_tower():
    """Build the sub-model that encodes one question.

    Layers, in order:
      1. ``Embedding`` initialised from the global ``embedding_matrix`` and
         frozen (``trainable=False``); expects sequences of length
         ``MAX_SEQUENCE_LENGTH``.
      2. ``Dense(EMBEDDING_DIM, relu)`` applied to every time step through
         ``TimeDistributed``.
      3. Maximum over axis 1 (the sequence axis of the Embedding output),
         leaving one vector of size ``EMBEDDING_DIM`` per sample.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    tower = Sequential()

    tower.add(Embedding(
        VOCAB_LENGTH + 1,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    ))

    tower.add(TimeDistributed(Dense(
        EMBEDDING_DIM,
        activation='relu',
    )))

    tower.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, )))

    return tower


def create_model(n_hidden_layers=4, hidden_units=200):
    """Create and compile the two-tower duplicate-question classifier.

    Two structurally identical towers (see ``_create_question_tower``) encode
    question 1 and question 2. They are separate models, so their trainable
    Dense weights are NOT shared. The tower outputs are concatenated and passed
    through ``n_hidden_layers`` stages of BatchNormalization + Dense(relu),
    followed by BatchNormalization and a single sigmoid unit.

    Args:
        n_hidden_layers: number of BatchNormalization + Dense stages after the
            concatenation. The default (4) reproduces the original network.
        hidden_units: width of each of those Dense layers. The default (200)
            reproduces the original network.

    Returns:
        A ``Sequential`` model compiled with binary cross-entropy loss, the
        Adam optimizer and the accuracy metric. It expects a list of two
        inputs: ``[q1_sequences, q2_sequences]``.
    """
    # Towers are created before the outer model, in q1 -> q2 order, so that
    # layers are instantiated in the same order as in the hand-unrolled version.
    model_q1 = _create_question_tower()
    model_q2 = _create_question_tower()

    model = Sequential()
    model.add(Merge([model_q1, model_q2], mode='concat'))

    for _ in range(n_hidden_layers):
        model.add(BatchNormalization())
        model.add(Dense(hidden_units, activation='relu'))

    model.add(BatchNormalization())
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    return model

In [15]:
# A single checkpoint file: the same path is handed to ModelCheckpoint and to
# load_weights in every fold, so each fold writes over the previous fold's file.
model_checkpoint_path = aux_data_folder + 'fold-checkpoint-' + feature_list_id + '.h5'

In [16]:
# Five stratified folds, shuffled reproducibly with the notebook-wide seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [17]:
# Buffer for the out-of-fold predictions, shaped like y_train. The dtype is
# forced to float64 so probabilities can be stored regardless of y_train's dtype.
y_train_oofp = np.zeros_like(y_train, dtype='float64')

In [18]:
%%time

for fold_num, (ix_fold_train, ix_fold_val) in enumerate(kfold.split(X_train, y_train)):
    X_fold_train = X_train[ix_fold_train]
    y_fold_train = y_train[ix_fold_train]
    
    X_fold_val = X_train[ix_fold_val]
    y_fold_val = y_train[ix_fold_val]
    
    print()
    print(f'Fitting fold {fold_num + 1} of {kfold.n_splits}')
    print()
    
    model = create_model()
    model.fit(
        [X_fold_train[:, 0], X_fold_train[:, 1]], y_fold_train,
        batch_size=64,
        epochs=200,
        validation_data=([X_fold_val[:, 0], X_fold_val[:, 1]], y_fold_val),
        class_weight=keras_get_class_weights(y_fold_train),
        callbacks=[
            EarlyStopping(
                monitor='val_loss',
                min_delta=0.005,
                patience=3,
                verbose=1,
                mode='auto',
            ),
            ModelCheckpoint(
                model_checkpoint_path,
                monitor='val_loss',
                save_best_only=True,
                verbose=2,
            ),
        ],
    )
        
    # Create out-of-fold prediction.
    model.load_weights(model_checkpoint_path)
    y_pred_oofp = model.predict_proba([X_fold_val[:, 0], X_fold_val[:, 1]])[:, -1]
    
    # Remember them.
    y_train_oofp[ix_fold_val] = y_pred_oofp


Fitting fold 1 of 5





Train on 323431 samples, validate on 80859 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 00011: early stopping
Fitting fold 2 of 5

Train on 323431 samples, validate on 80859 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 00007: early stopping
Fitting fold 3 of 5

Train on 323432 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200


Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 00008: early stopping
Fitting fold 4 of 5

Train on 323433 samples, validate on 80857 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 00009: early stopping
Fitting fold 5 of 5

Train on 323433 samples, validate on 80857 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200


Epoch 9/200
Epoch 10/200
Epoch 00009: early stopping
Wall time: 28min 3s


### Save Train Features

In [19]:
# Turn the predictions into a single-column 2-D array of shape (n_samples, 1).
y_train_oofp = y_train_oofp.reshape((-1, 1))

In [20]:
# Store the out-of-fold predictions as the train feature file for this feature list.
# NOTE(review): `save` is not defined or imported in the visible cells —
# presumably a pickle-writing helper from a project utility import; TODO confirm.
save(y_train_oofp, features_data_folder + f'X_train_{feature_list_id}.pickle')

## Save Test Features

In [21]:
# Test-set counterparts of X_train_q1 / X_train_q2 (same file-name pattern).
X_test_q1 = load(features_data_folder + 'X_test_nn_fasttext_q1_filtered_no_stopwords.pickle')
X_test_q2 = load(features_data_folder + 'X_test_nn_fasttext_q2_filtered_no_stopwords.pickle')

In [22]:
# NOTE: `model` is the variable left behind by the cross-validation loop, i.e.
# the model of the LAST fold with its best-checkpoint weights loaded. The models
# of the other folds do not contribute to the test predictions.
# It would be better to fit the model on the whole training set,
# but there would then be no held-out validation set for early stopping.
y_test_oofp = model.predict_proba([X_test_q1, X_test_q2])[:, -1]



In [23]:
# Turn the predictions into a single-column 2-D array of shape (n_samples, 1).
y_test_oofp = y_test_oofp.reshape((-1, 1))

In [24]:
# Store the test predictions as the test feature file for this feature list.
# NOTE(review): `save` is not defined or imported in the visible cells —
# presumably a pickle-writing helper from a project utility import; TODO confirm.
save(y_test_oofp, features_data_folder + f'X_test_{feature_list_id}.pickle')