## Imports

In [None]:
import os
import pprint

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# NOTE(review): cuda_use_gpus is defined outside this view; presumably it
# restricts the backend to GPU 0 — confirm. It runs before the Keras imports.
cuda_use_gpus(0)

In [None]:
from keras.models import Model
from keras.layers import *
from keras.callbacks import EarlyStopping, ModelCheckpoint

## Config

In [None]:
# Tag embedded in the checkpoint, saved-model and history file names
# produced by this notebook.
EXPERIMENT_ID = 'lystdo-fasttext'

In [None]:
# Seed shared by NumPy's global RNG and the shuffled K-fold split.
RANDOM_SEED = 42

In [None]:
# Seed NumPy's global RNG, which is what the random search grid is sampled from.
# NOTE(review): only NumPy is seeded in the visible code; the Keras backend RNG
# (weight init, dropout) is not, so training runs may still differ — confirm.
np.random.seed(RANDOM_SEED)

In [None]:
# Root data directory: "<parent of the working directory>/data/", absolute,
# with a trailing separator so file names can be appended directly.
data_folder = os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')) + os.path.sep

# Sub-directories of the data root, each also ending with a separator.
(
    aux_data_folder,
    preproc_data_folder,
    features_data_folder,
    submissions_data_folder,
) = (
    os.path.join(data_folder, subfolder_name) + os.path.sep
    for subfolder_name in ('aux', 'preproc', 'features', 'submissions')
)

## Read Data

In [None]:
# Pre-built embedding weights; the first axis is used as the vocabulary size
# and the last axis as the embedding dimension, and the matrix initializes the
# frozen Embedding layer. `load` is a helper defined outside this view.
embedding_matrix = load(aux_data_folder + 'embedding_weights_fasttext_filtered_no_stopwords.pickle')

In [None]:
# Model inputs for the first and the second question of every training pair.
# The last axis is the sequence length (presumably padded token ids indexing
# into embedding_matrix — TODO confirm against the feature-building notebook).
X_train_q1 = load(features_data_folder + 'X_train_nn_fasttext_q1_filtered_no_stopwords.pickle')
X_train_q2 = load(features_data_folder + 'X_train_nn_fasttext_q2_filtered_no_stopwords.pickle')

In [None]:
# Training labels, indexed with the same row indices as X_train_q1 / X_train_q2
# (presumably 0/1 duplicate flags, given the sigmoid output — TODO confirm).
y_train = load(features_data_folder + 'y_train.pickle')

In [None]:
# Model dimensions are derived from the loaded arrays rather than hard-coded.
EMBEDDING_DIM = embedding_matrix.shape[-1]  # width of one embedding vector
VOCAB_LENGTH = embedding_matrix.shape[0]  # number of rows in the embedding matrix
MAX_SEQUENCE_LENGTH = X_train_q1.shape[-1]  # length of one input sequence

In [None]:
# Sanity check of the derived dimensions.
print(EMBEDDING_DIM, VOCAB_LENGTH, MAX_SEQUENCE_LENGTH)

Mirror the dataset question-wise and append it to the original one

In [None]:
# Augment the data with the swapped pairs: the first half of each array keeps
# the original (q1, q2) order, the second half holds the same pairs as (q2, q1).
# Both stacks are built from the un-mirrored arrays before anything is rebound.
X_train_q1_new, X_train_q2_new = (
    np.vstack([upper_half, lower_half])
    for upper_half, lower_half in (
        (X_train_q1, X_train_q2),
        (X_train_q2, X_train_q1),
    )
)

X_train_q1, X_train_q2 = X_train_q1_new, X_train_q2_new

In [None]:
# Repeat the labels once so they line up with the doubled input arrays:
# a mirrored pair keeps the label of the pair it was copied from.
y_train = np.concatenate([y_train, y_train])

In [None]:
# Show the array shapes after mirroring; the first dimensions should agree.
print('X_train_q1:', X_train_q1.shape)
print('X_train_q2:', X_train_q2.shape)
print('y_train   :', y_train.shape)

## Define Models

In [None]:
def create_model(params):
    """Build and compile the two-input question-pair classifier.

    Both questions go through the same frozen embedding layer and the same
    LSTM encoder. The two encodings are concatenated and fed through
    dropout + batch normalization, one ReLU dense layer, dropout + batch
    normalization again, and a single sigmoid output unit.

    Args:
        params: dict with the keys ``'num_lstm'`` (LSTM units),
            ``'num_dense'`` (dense units), ``'lstm_dropout_rate'`` (used for
            both the input and the recurrent dropout of the LSTM) and
            ``'dense_dropout_rate'``.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the
        ``nadam`` optimizer and the accuracy metric.
    """
    lstm_dropout = params['lstm_dropout_rate']
    dense_dropout = params['dense_dropout_rate']

    # Layers shared by both question branches (weights are tied).
    shared_embedding = Embedding(
        VOCAB_LENGTH,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )
    shared_encoder = LSTM(
        params['num_lstm'],
        dropout=lstm_dropout,
        recurrent_dropout=lstm_dropout,
    )

    def encode_question():
        """Create one input tensor and return it with its LSTM encoding."""
        question_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_question = shared_embedding(question_input)
        return question_input, shared_encoder(embedded_question)

    def regularize(tensor):
        """Apply dropout followed by batch normalization."""
        dropped = Dropout(dense_dropout)(tensor)
        return BatchNormalization()(dropped)

    sequence_1_input, question_1_encoding = encode_question()
    sequence_2_input, question_2_encoding = encode_question()

    features = regularize(concatenate([question_1_encoding, question_2_encoding]))
    features = Dense(params['num_dense'], activation='relu')(features)
    features = regularize(features)

    prediction = Dense(1, activation='sigmoid')(features)

    model = Model(
        inputs=[sequence_1_input, sequence_2_input],
        outputs=prediction,
    )
    model.compile(
        loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['accuracy'],
    )
    return model

In [None]:
def get_model_fingerprint(params):
    """Return a file-name-friendly tag describing a hyperparameter set.

    The tag starts with ``EXPERIMENT_ID`` and lists the LSTM size, the dense
    size and both dropout rates (rounded to three decimals).

    Args:
        params: dict with the keys ``'num_lstm'``, ``'num_dense'``,
            ``'lstm_dropout_rate'`` and ``'dense_dropout_rate'``.

    Returns:
        The fingerprint string.
    """
    hyperparameter_tag = (
        f"-lstm-{params['num_lstm']}"
        f"-dense-{params['num_dense']}"
        f"-droplstm-{params['lstm_dropout_rate']:.3f}"
        f"-dropdense-{params['dense_dropout_rate']:.3f}"
    )
    return EXPERIMENT_ID + hyperparameter_tag

## Do a K-Fold Split

In [None]:
# 5-fold stratified splitter; shuffling uses the fixed seed so the fold
# assignment is the same on every run.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_SEED
)

In [None]:
# The training arrays are the original pairs stacked on top of their mirrored
# copies, so row i and row i + n_original describe the same question pair.
# Splitting the stacked rows independently would put a pair into the training
# part and its mirror into the validation part, leaking validation data and
# making val_loss (and therefore early stopping and the CV score) optimistic.
# Instead, split the original rows only and carry every row's mirror along
# into the same side of the split.
n_original = len(y_train) // 2

# Fail loudly if the arrays do not have the expected "original + mirror" layout.
if not (
    np.array_equal(X_train_q1[:n_original], X_train_q2[n_original:])
    and np.array_equal(X_train_q2[:n_original], X_train_q1[n_original:])
):
    raise ValueError(
        'Expected X_train_q1 / X_train_q2 to hold the original pairs followed '
        'by their mirrored copies; run the mirroring cell exactly once first.'
    )


def _with_mirrored_rows(ix_original):
    """Return the given original-row indices plus the indices of their mirrors."""
    return np.concatenate([ix_original, ix_original + n_original])


# Each fold is ((q1, q2, y) for training, (q1, q2, y) for validation).
folds = [
    (
        (X_train_q1[ix_fold_train], X_train_q2[ix_fold_train], y_train[ix_fold_train]),
        (X_train_q1[ix_fold_val], X_train_q2[ix_fold_val], y_train[ix_fold_val]),
    )
    for (ix_fold_train, ix_fold_val) in (
        (_with_mirrored_rows(ix_train), _with_mirrored_rows(ix_val))
        for (ix_train, ix_val) in kfold.split(
            X_train_q1[:n_original],
            y_train[:n_original],
        )
    )
]

## Define Random Search Structure

In [None]:
# Number of hyperparameter combinations sampled for the random search.
num_random_search_iterations = 30

In [None]:
def _sample_search_params():
    """Draw one hyperparameter combination from NumPy's global RNG.

    Layer sizes are integers from [128, 512) and [50, 250); both dropout
    rates are floats from [0, 0.5).
    """
    return {
        'num_lstm': np.random.randint(128, 512),
        'num_dense': np.random.randint(50, 250),
        'lstm_dropout_rate': np.random.random_sample() / 2,
        'dense_dropout_rate': np.random.random_sample() / 2,
    }


# All candidates are drawn up front, before any training starts.
search_grid = [
    _sample_search_params()
    for _ in range(num_random_search_iterations)
]

## Perform Random Search

In [None]:
# One checkpoint file shared by every fold of every search iteration;
# each fit overwrites it with that run's best-so-far model.
model_checkpoint_path = aux_data_folder + 'fold-checkpoint-' + EXPERIMENT_ID + '.h5'

In [None]:
# Random search driver: for every sampled hyperparameter combination, train a
# fresh model on each CV fold, score the combination by the mean of the folds'
# best validation losses, and keep track of the best combination seen.
histories = []  # one (params, best val_loss, Keras history dict) entry per fold fit
best_score = 1e9  # lower is better (validation log loss)
best_params = None

# Begin Random Search.
for search_iter, current_params in enumerate(search_grid):

    print()
    print('-' * 30, f'Iteration {search_iter + 1} / {num_random_search_iterations}', '-' * 30)
    print('Trying parameter combination:')
    pprint.pprint(current_params)

    current_iter_val_scores = []

    # Begin K-Fold.
    for fold_num, fold in enumerate(folds):
        X_fold_train_q1, X_fold_train_q2, y_fold_train = fold[0]
        X_fold_val_q1, X_fold_val_q2, y_fold_val = fold[1]

        print()
        print(f'Fitting fold {fold_num + 1} of {kfold.n_splits}')
        print()

        # A brand-new model per fold so no weights carry over between folds.
        # NOTE(review): models accumulate in the backend across the whole
        # search; consider keras.backend.clear_session() between fits if
        # memory grows — confirm it does not undo the cuda_use_gpus() setup.
        model = create_model(current_params)
        history = model.fit(
            [X_fold_train_q1, X_fold_train_q2], y_fold_train,
            validation_data=([X_fold_val_q1, X_fold_val_q2], y_fold_val),

            batch_size=2048,
            # Upper bound only; early stopping normally ends training sooner.
            epochs=200,
            verbose=0,

            callbacks=[
                # Stop once val_loss has not improved by at least 0.001
                # for 3 consecutive epochs.
                EarlyStopping(
                    monitor='val_loss',
                    min_delta=0.001,
                    patience=3,
                    verbose=1,
                    mode='auto',
                ),
                # Keep this fit's best-val_loss model in the shared
                # checkpoint file (overwritten by every fit).
                ModelCheckpoint(
                    model_checkpoint_path,
                    monitor='val_loss',
                    save_best_only=True,
                    verbose=2,
                ),
            ],
        )

        # The fold is scored by its best epoch, not its last one.
        best_val_score = min(history.history['val_loss'])
        print(f'Validation score: {best_val_score}')

        current_iter_val_scores.append(best_val_score)
        histories.append((current_params, best_val_score, history.history))

    # End K-Fold.
    # Save the trained model with the current parameter combination.
    current_iter_avg_score = np.mean(current_iter_val_scores)
    model_save_filename = '{}-random-search-{:.4f}-{}.keras'.format(
        EXPERIMENT_ID,
        current_iter_avg_score,
        get_model_fingerprint(current_params)
    )

    if current_iter_avg_score < best_score:
        best_score = current_iter_avg_score
        best_params = current_params

    print()
    print('CV score  :', current_iter_avg_score)
    print('Saving as :', model_save_filename)

    # `model` is the last fold's model and, after early stopping, holds the
    # weights of its final epoch, which is `patience` epochs past the best
    # one. The checkpoint file still contains that fold's best-epoch model,
    # so restore those weights before persisting. The file name carries the
    # CV mean, while the weights come from the last fold only.
    model.load_weights(model_checkpoint_path)
    model.save(aux_data_folder + model_save_filename)

# End Random Search.
# Print best params and save history.
print()
print('=' * 70)
print('Best CV score:', best_score)
print('Best params:')
pprint.pprint(best_params)

# `save` is a helper defined outside this view; it persists the per-fold
# training histories for later analysis.
save(histories, aux_data_folder + f'{EXPERIMENT_ID}-random-search-history.pickle')