## Imports

In [None]:
import datetime
import gc

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *

In [None]:
# Configure GPU usage for this session. This runs before the Keras imports in
# the next cell.
# NOTE(review): cuda_use_gpus is defined outside this view; whether the
# argument is a device index or a device count cannot be told from here.
cuda_use_gpus(1)

In [None]:
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping, ModelCheckpoint

## Config

In [None]:
# Identifier passed to save_feature_names / save_feature_list below and
# embedded in the per-fold checkpoint filename.
feature_list_id = 'oofp_currie32_cnn_activations'

In [None]:
# Cross-validation and training schedule.
NUM_FOLDS = 5  # number of StratifiedKFold splits
NUM_EPOCHS = 200  # upper bound on epochs per fold; fit() also uses EarlyStopping
BATCH_SIZE = 128  # mini-batch size passed to model.fit

In [None]:
# Seed used for NumPy's global RNG and for the StratifiedKFold shuffle.
RANDOM_SEED = 42

In [None]:
# Seed NumPy's global RNG. This call does not seed any other RNG.
# NOTE(review): `np` is not imported in the visible cells — presumably it is
# provided by a setup cell or star import outside this view; confirm.
np.random.seed(RANDOM_SEED)

## Read Data

In [None]:
# Embedding weights used to initialise the frozen Embedding layers below.
# Its first axis is read as the vocabulary size and its last axis as the
# embedding dimension.
# NOTE(review): `load` and `aux_data_folder` are defined outside this view.
# The '.pickle' suffix suggests unpickling — only load trusted files.
embedding_matrix = load(aux_data_folder + 'embedding_weights_fasttext_filtered_no_stopwords.pickle')

In [None]:
# Training inputs for question 1 and question 2. The last axis is used as the
# sequence length and the model input is declared int32.
# NOTE(review): presumably padded token-id sequences indexing into
# embedding_matrix — confirm against the preprocessing notebook.
X_train_q1 = load(features_data_folder + 'X_train_nn_fasttext_q1_filtered_no_stopwords.pickle')
X_train_q2 = load(features_data_folder + 'X_train_nn_fasttext_q2_filtered_no_stopwords.pickle')

In [None]:
# Test-set inputs for question 1 and question 2, fed to model.predict after
# the last fold.
# NOTE(review): presumably encoded the same way as the training inputs — confirm.
X_test_q1 = load(features_data_folder + 'X_test_nn_fasttext_q1_filtered_no_stopwords.pickle')
X_test_q2 = load(features_data_folder + 'X_test_nn_fasttext_q2_filtered_no_stopwords.pickle')

In [None]:
# Training targets. Used as the stratification labels for the K-fold split,
# as the binary cross-entropy target, and as ground truth for the CV log loss.
y_train = load(features_data_folder + 'y_train.pickle')

In [None]:
# Model dimensions derived from the loaded data rather than hard-coded.
EMBEDDING_DIM = embedding_matrix.shape[-1]  # width of one embedding vector
VOCAB_LENGTH = embedding_matrix.shape[0]  # number of rows in the embedding matrix
MAX_SEQUENCE_LENGTH = X_train_q1.shape[-1]  # length of each input sequence

In [None]:
# Sanity check: show the derived dimensions before building any model.
print(EMBEDDING_DIM, VOCAB_LENGTH, MAX_SEQUENCE_LENGTH)

## Train Models & Compute Out-of-Fold Predictions

In [None]:
# Initialisers shared by every layer built in create_dense_block and by the
# final 'target_output' layer. The kernel initialiser uses a fixed seed.
# NOTE(review): `initializers` is not imported explicitly in the visible
# cells — presumably it arrives via `from keras.layers import *`; confirm.
init_weights = initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=2)
init_bias = 'zeros'

In [None]:
def create_embedding_block():
    """Build one question input together with its frozen embedding lookup.

    Every call creates a fresh ``Input`` and a fresh ``Embedding`` layer, so
    the two questions do not share a layer instance.

    Returns:
        A ``(input_tensor, embedded_tensor)`` pair: the int32 input of length
        ``MAX_SEQUENCE_LENGTH`` and the output of an ``Embedding`` layer that
        is initialised from ``embedding_matrix`` and excluded from training.
    """
    token_ids = Input(dtype='int32', shape=(MAX_SEQUENCE_LENGTH,))

    frozen_lookup = Embedding(
        input_dim=VOCAB_LENGTH,
        output_dim=EMBEDDING_DIM,
        input_length=MAX_SEQUENCE_LENGTH,
        weights=[embedding_matrix],
        trainable=False,
    )

    return token_ids, frozen_lookup(token_ids)

In [None]:
def create_model_question_conv_branch(input_seq, params):
    """Build the convolutional feature extractor for one question.

    Applies two identical stages of Conv1D ('same' padding) ->
    BatchNormalization -> ReLU -> Dropout, then flattens the result.

    Args:
        input_seq: Tensor to convolve over (the embedded question sequence).
        params: Mapping providing 'num_conv_filters', 'conv_kernel_size'
            and 'conv_dropout_rate'.

    Returns:
        The flattened output tensor of the second stage.
    """
    features = input_seq

    # Both stages use the same hyper-parameters; only their input differs.
    for _ in range(2):
        features = Conv1D(
            filters=params['num_conv_filters'],
            kernel_size=params['conv_kernel_size'],
            padding='same',
        )(features)
        features = BatchNormalization()(features)
        features = Activation('relu')(features)
        features = Dropout(params['conv_dropout_rate'])(features)

    return Flatten()(features)

In [None]:
def create_model_question_timedist_max_branch(input_seq, params):
    """Build the time-distributed dense + max-over-axis-1 branch for one question.

    A Dense layer of ``EMBEDDING_DIM`` units is applied through
    ``TimeDistributed``, followed by BatchNormalization, ReLU and Dropout.
    The result is then reduced with ``K.max`` along axis 1.

    Args:
        input_seq: Tensor to transform (the embedded question sequence).
        params: Mapping providing 'timedist_dropout_rate'.

    Returns:
        The reduced tensor, declared with output shape ``(EMBEDDING_DIM,)``.
    """
    x = TimeDistributed(Dense(EMBEDDING_DIM))(input_seq)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(params['timedist_dropout_rate'])(x)

    # Collapse axis 1, keeping the per-unit maximum.
    return Lambda(
        lambda steps: K.max(steps, axis=1),
        output_shape=(EMBEDDING_DIM, ),
    )(x)

In [None]:
def create_dense_block(input_layer, num_units, dropout_rate):
    """Apply Dense -> BatchNormalization -> ReLU -> Dropout to a tensor.

    Args:
        input_layer: Tensor fed into the Dense layer.
        num_units: Number of units in the Dense layer.
        dropout_rate: Rate passed to the trailing Dropout layer.

    Returns:
        The output tensor of the Dropout layer.
    """
    stages = (
        # The Dense layer uses the module-level initialisers.
        Dense(
            num_units,
            kernel_initializer=init_weights,
            bias_initializer=init_bias,
        ),
        BatchNormalization(),
        Activation('relu'),
        Dropout(dropout_rate),
    )

    tensor = input_layer
    for stage in stages:
        tensor = stage(tensor)
    return tensor

In [None]:
def zero_loss(y_true, y_pred):
    """Return a constant zero tensor of shape (1,), ignoring both arguments.

    Used as the loss of the auxiliary 'feature_output' head, which is exposed
    only so its activations can be read back with ``predict``.
    """
    return K.zeros(shape=(1,))

In [None]:
def create_model(params):
    """Build and compile the two-question model with an auxiliary feature output.

    Each question goes through a convolutional branch and a time-distributed
    max branch. The two questions' outputs are concatenated per branch type
    and passed through two dense blocks. The two results are concatenated
    into the layer named 'feature_output', which feeds three more dense
    blocks and a single sigmoid unit named 'target_output'.

    Args:
        params: Mapping with the keys read by the branch builders plus
            'num_dense_1', 'num_dense_2', 'num_dense_3' and
            'dense_dropout_rate'.

    Returns:
        A compiled Keras ``Model`` with inputs ``[q1, q2]`` and outputs
        ``[target_output, feature_output]``. Only 'target_output' contributes
        to the loss (binary cross-entropy); 'feature_output' uses
        ``zero_loss`` with weight 0.
    """
    def stack_dense_blocks(tensor, unit_keys):
        # Chain one dense block per key, in the given order.
        for key in unit_keys:
            tensor = create_dense_block(tensor, params[key], params['dense_dropout_rate'])
        return tensor

    two_level = ('num_dense_1', 'num_dense_2')
    three_level = ('num_dense_1', 'num_dense_2', 'num_dense_3')

    (input_q1, emb_q1), (input_q2, emb_q2) = (
        create_embedding_block(),
        create_embedding_block(),
    )
    embedded = (emb_q1, emb_q2)

    # Per-question feature extractors.
    conv_outputs = [create_model_question_conv_branch(emb, params) for emb in embedded]
    timedist_outputs = [create_model_question_timedist_max_branch(emb, params) for emb in embedded]

    # Mid-level transforms: merge both questions per branch type.
    conv_features = stack_dense_blocks(concatenate(conv_outputs), two_level)
    timedist_features = stack_dense_blocks(concatenate(timedist_outputs), two_level)

    # This named concatenation is what gets exported as features.
    merged_main = Concatenate(name='feature_output')([conv_features, timedist_features])
    head = stack_dense_blocks(merged_main, three_level)

    output = Dense(
        1,
        activation='sigmoid',
        name='target_output',
        kernel_initializer=init_weights,
        bias_initializer=init_bias,
    )(head)

    model = Model(inputs=[input_q1, input_q2], outputs=[output, merged_main])
    model.compile(
        optimizer='nadam',
        loss={'target_output': 'binary_crossentropy', 'feature_output': zero_loss},
        loss_weights={'target_output': 1.0, 'feature_output': 0.0},
        metrics=None,
    )
    return model

In [None]:
# Hyper-parameters consumed by create_model and the branch builders.
model_params = {
    # Filters and kernel size of both Conv1D layers in the conv branch.
    'num_conv_filters': 32,
    # Units of the first/second/third dense block. Levels 1 and 2 are reused
    # by the conv stack, the timedist stack and the main stack; level 3 only
    # by the main stack.
    'num_dense_1': 256,
    'num_dense_2': 128,
    'num_dense_3': 100,
    'conv_kernel_size': 3,
    # Dropout rates for the conv branch, the timedist branch and dense blocks.
    'conv_dropout_rate': 0.25,
    'timedist_dropout_rate': 0.25,
    'dense_dropout_rate': 0.25,
}

In [None]:
# Width of the 'feature_output' layer: it concatenates the conv stack and the
# timedist stack, each of which ends in a dense block of 'num_dense_2' units.
feature_output_size = model_params['num_dense_2'] * 2

In [None]:
# Single checkpoint file reused by every fold; each fold's ModelCheckpoint
# writes to it and the best weights are reloaded from it before predicting.
model_checkpoint_path = aux_data_folder + 'fold-checkpoint-' + feature_list_id + '.h5'

In [None]:
# Shuffled, stratified K-fold splitter with a fixed seed so that the fold
# assignment is reproducible.
kfold = StratifiedKFold(
    n_splits=NUM_FOLDS,
    shuffle=True,
    random_state=RANDOM_SEED
)

In [None]:
# Out-of-fold buffers for the training set, filled fold by fold:
# the predicted target (same shape as y_train) and the 'feature_output'
# activations (one row per training example).
y_train_oofp = np.zeros_like(y_train, dtype='float32')
y_train_oofp_features = np.zeros((len(y_train), feature_output_size), dtype='float32')

In [None]:
# Placeholders for the test-set predictions and activations.
# NOTE(review): y_test_oofp is allocated with one column per fold, but the
# training loop rebinds both names to the last fold's model.predict output
# instead of filling these arrays — confirm whether per-fold columns were
# intended.
y_test_oofp = np.zeros((len(X_test_q1), NUM_FOLDS), dtype='float32')
y_test_oofp_features = np.zeros((len(X_test_q1), feature_output_size), dtype='float32')

In [None]:
# Train one model per stratified fold and collect, for the held-out part of
# each fold, both the predicted target and the 'feature_output' activations.
# Test-set outputs are taken from the model of the last fold only.
for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_q1, y_train)):
    X_fold_train_q1 = X_train_q1[ix_train]
    X_fold_train_q2 = X_train_q2[ix_train]

    X_fold_val_q1 = X_train_q1[ix_val]
    X_fold_val_q2 = X_train_q2[ix_val]

    y_fold_train = y_train[ix_train]
    y_fold_val = y_train[ix_val]

    print()
    print(f'Fitting fold {fold_num + 1} of {kfold.n_splits}')
    print()

    # Keras requires a target for every output. The 'feature_output' head has
    # zero loss and zero loss weight, so all-zero dummy targets are supplied.
    dummy_train_features = np.zeros((len(y_fold_train), feature_output_size))
    dummy_val_features = np.zeros((len(y_fold_val), feature_output_size))

    model = create_model(model_params)
    model.fit(
        [X_fold_train_q1, X_fold_train_q2],
        [y_fold_train, dummy_train_features],

        validation_data=(
            [X_fold_val_q1, X_fold_val_q2],
            [y_fold_val, dummy_val_features],
        ),

        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHS,
        verbose=1,

        callbacks=[
            EarlyStopping(
                monitor='val_loss',
                min_delta=0.001,
                patience=3,
                verbose=1,
                mode='auto',
            ),
            ModelCheckpoint(
                model_checkpoint_path,
                monitor='val_loss',
                save_best_only=True,
                verbose=2,
            ),
        ],
    )

    # Restore the best epoch (by val_loss) before predicting; the in-memory
    # weights are those of the last epoch run.
    model.load_weights(model_checkpoint_path)

    # Create out-of-fold prediction. Reuse the validation slices made above
    # instead of indexing the full training arrays a second time.
    fold_val_pred, fold_val_features = model.predict(
        [X_fold_val_q1, X_fold_val_q2],
        batch_size=1024,
        verbose=1
    )

    # The sigmoid head has one unit, so its prediction has a trailing axis of
    # size 1. Reshape it to the per-row shape of the buffer so the assignment
    # works whether y_train is 1-D or a column vector; assigning the raw
    # (n, 1) array into a 1-D buffer raises a shape-mismatch error.
    y_train_oofp[ix_val] = np.reshape(
        fold_val_pred,
        (len(ix_val),) + y_train_oofp.shape[1:],
    )
    y_train_oofp_features[ix_val] = fold_val_features

    # Only the last fold's model is applied to the test set.
    # NOTE(review): presumably because activations of independently trained
    # models are not comparable column by column, so they cannot be averaged
    # across folds — confirm.
    if fold_num + 1 == NUM_FOLDS:
        y_test_oofp, y_test_oofp_features = model.predict(
            [X_test_q1, X_test_q2],
            batch_size=1024,
            verbose=1
        )

    # Clear GPU memory.
    K.clear_session()
    del X_fold_train_q1
    del X_fold_train_q2
    del X_fold_val_q1
    del X_fold_val_q2
    del model
    gc.collect()

In [None]:
# Cross-validated log loss of the out-of-fold target predictions against the
# true training targets (log_loss comes from the sklearn.metrics star import).
cv_score = log_loss(y_train, y_train_oofp)
print('CV score:', cv_score)

## Save feature names

In [None]:
# One name per column of the saved activation matrices. The train and test
# feature arrays have `feature_output_size` columns, so a single name would
# not line up with them.
# NOTE(review): assumes save_feature_names expects one name per column, as
# its name suggests — confirm against the helper.
feature_names = [
    f'oofp_currie32_cnn_activations_{column_index}'
    for column_index in range(feature_output_size)
]

In [None]:
# Save the feature names under this notebook's feature list id.
# NOTE(review): save_feature_names is defined outside this view; where and in
# what format it writes cannot be told from here.
save_feature_names(feature_names, feature_list_id)

## Save Train features

In [None]:
# Save the out-of-fold 'feature_output' activations for the training set.
# The predicted target itself (y_train_oofp) is not saved here.
save_feature_list(y_train_oofp_features, 'train', feature_list_id)

## Save Test features

In [None]:
# Save the test-set 'feature_output' activations, which the training loop
# takes from the last fold's model.
save_feature_list(y_test_oofp_features, 'test', feature_list_id)

## Explore

In [None]:
# Per-column summary statistics of the training activations (one row per
# activation column after the transpose).
# NOTE(review): `pd` is not imported in the visible cells — presumably
# provided by a setup cell outside this view.
pd.DataFrame(y_train_oofp_features).describe().T

In [None]:
# Per-column summary statistics of the test activations, for comparison with
# the training summary.
pd.DataFrame(y_test_oofp_features).describe().T