## Imports

In [None]:
import datetime
import gc

In [None]:
import pprint

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *

In [None]:
# Called before Keras is imported. `cuda_use_gpus` is defined outside this
# excerpt; presumably it restricts the process to CUDA device 0 (TODO confirm).
cuda_use_gpus(0)

In [None]:
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
from sigopt import Connection

## Config

In [None]:
# Number of splits used by the StratifiedKFold partitioner.
NUM_FOLDS = 5

In [None]:
# Upper bound on training epochs per fold (passed as `epochs` to `model.fit`);
# the EarlyStopping callback may end training sooner.
NUM_EPOCHS = 200

In [None]:
# File that the ModelCheckpoint callback writes the best model (by `val_loss`)
# to while a fold is being fitted. `aux_data_folder` is defined outside this
# excerpt.
model_checkpoint_path = aux_data_folder + 'fold-checkpoint-sigopt-keras-cnn.h5'

In [None]:
# Number of suggestion -> evaluation -> observation rounds run by the
# optimization loop.
NUM_OPTIMIZATION_ITERATIONS = 50

In [None]:
# Seed shared by NumPy's global RNG and the StratifiedKFold shuffling.
RANDOM_SEED = 42

In [None]:
# ID of an existing SigOpt experiment to resume. When left falsy (None), a new
# experiment is created instead.
SIGOPT_EXPERIMENT_ID = None

In [None]:
# Client token passed to the SigOpt `Connection`. This is a placeholder:
# substitute a real token before running, and avoid committing it.
SIGOPT_TOKEN = 'YOUR_TOKEN_HERE'

In [None]:
# Seed NumPy's global random number generator.
np.random.seed(RANDOM_SEED)

## Read Data

In [None]:
# Embedding weight matrix used to initialise the frozen (non-trainable)
# Embedding layers. `load` and `aux_data_folder` are defined outside this
# excerpt.
embedding_matrix = load(aux_data_folder + 'embedding_weights_fasttext_filtered_no_stopwords.pickle')

In [None]:
# Training inputs for question 1 and question 2. They are fed to the model's
# int32 sequence inputs, and their last axis defines MAX_SEQUENCE_LENGTH.
X_train_q1 = load(features_data_folder + 'X_train_nn_fasttext_q1_filtered_no_stopwords.pickle')
X_train_q2 = load(features_data_folder + 'X_train_nn_fasttext_q2_filtered_no_stopwords.pickle')

In [None]:
# Test-set counterparts of the question inputs. They are not referenced again
# in the visible part of this notebook.
X_test_q1 = load(features_data_folder + 'X_test_nn_fasttext_q1_filtered_no_stopwords.pickle')
X_test_q2 = load(features_data_folder + 'X_test_nn_fasttext_q2_filtered_no_stopwords.pickle')

In [None]:
# Names of the precomputed feature lists that are loaded and fed to the model
# as an extra dense ("magic") input.
magic_feature_lists = [
    'magic_jturkewitz',
    'magic_stas_svd_150',
    'magic_stas_avito',
]

In [None]:
# Load the train and test feature tables for the lists above; the third
# returned value is discarded. `load_feature_lists` is defined outside this
# excerpt.
X_train_magic, X_test_magic, _ = load_feature_lists(magic_feature_lists)

In [None]:
# Keep only the underlying arrays (`.values`). Presumably the loaded objects
# are pandas DataFrames -- TODO confirm.
X_train_magic = X_train_magic.values
X_test_magic = X_test_magic.values

In [None]:
# Standardize the magic features. The scaler statistics are fitted on the
# train and test rows stacked together, then applied to each set separately.
scaler = StandardScaler().fit(np.vstack([X_train_magic, X_test_magic]))
X_train_magic, X_test_magic = (
    scaler.transform(X_train_magic),
    scaler.transform(X_test_magic),
)

In [None]:
# Training labels; used both for the stratified fold split and as fit targets.
y_train = load(features_data_folder + 'y_train.pickle')

In [None]:
# Model dimensions derived from the loaded data.
# Size of each embedding vector (last axis of the embedding matrix).
EMBEDDING_DIM = embedding_matrix.shape[-1]
# Number of rows in the embedding matrix (first axis).
VOCAB_LENGTH = embedding_matrix.shape[0]
# Length of each question sequence (last axis of the question-1 input).
MAX_SEQUENCE_LENGTH = X_train_q1.shape[-1]

## Partition the data

In [None]:
# Stratified K-fold splitter. Shuffling with a fixed `random_state` makes the
# fold assignment reproducible across calls to `kfold.split`.
kfold = StratifiedKFold(
    n_splits=NUM_FOLDS,
    shuffle=True,
    random_state=RANDOM_SEED
)

## Set up the experiment

In [None]:
# SigOpt API client, created with the token configured in SIGOPT_TOKEN.
conn = Connection(client_token=SIGOPT_TOKEN)

In [None]:
# Create a new SigOpt experiment unless an existing experiment ID was
# configured, in which case that experiment is fetched and reused.
if not SIGOPT_EXPERIMENT_ID:
    experiment = conn.experiments().create(
        name='CNN over FastText',
        # Search space: the widths of the three dense layers and a single
        # dropout rate.
        parameters=[
            {'name': 'num_dense_1', 'type': 'int', 'bounds': {'min': 8, 'max': 500}},
            {'name': 'num_dense_2', 'type': 'int', 'bounds': {'min': 8, 'max': 500}},
            {'name': 'num_dense_3', 'type': 'int', 'bounds': {'min': 8, 'max': 500}},
            {'name': 'dropout_rate', 'type': 'double', 'bounds': {'min': 0.0, 'max': 0.75}},
        ],
    )
    print("Created experiment: https://sigopt.com/experiment/" + experiment.id)

else:
    experiment = conn.experiments(id=SIGOPT_EXPERIMENT_ID).fetch()

In [None]:
def create_model(params):
    """Build and compile the two-question classifier.

    Each question is embedded with a frozen Embedding layer initialised from
    `embedding_matrix`, then passed through two feature extractors: a
    two-layer Conv1D branch and a TimeDistributed-Dense + max-over-time
    branch. Every helper call below instantiates new layers, so the question-1
    and question-2 branches do not share weights. The per-extractor features
    are merged across questions, passed through dense blocks, concatenated
    with the "magic" feature input and fed to the main dense block and a
    sigmoid output unit.

    Reads the module-level names MAX_SEQUENCE_LENGTH, VOCAB_LENGTH,
    EMBEDDING_DIM, `embedding_matrix` and `X_train_magic`.

    Args:
        params: Mapping with the keys 'num_conv_filters', 'conv_kernel_size',
            'conv_dropout_rate', 'timedist_dropout_rate', 'num_dense_1',
            'num_dense_2', 'num_dense_3' and 'dense_dropout_rate'.

    Returns:
        A compiled Keras `Model` taking `[question_1, question_2, magic]`
        inputs, using binary cross-entropy loss, the 'nadam' optimizer and
        an accuracy metric.
    """
    # Initializers used by every Dense layer built through `dense_stack` and
    # by the output unit.
    dense_kernel_init = initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=2)
    dense_bias_init = 'zeros'

    def embed_question():
        """Return (input tensor, embedded sequence) for one question."""
        tokens = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
        embedded = Embedding(
            VOCAB_LENGTH,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False,
        )(tokens)
        return tokens, embedded

    def conv_branch(embedded):
        """Two Conv1D -> BatchNorm -> ReLU -> Dropout stages, then flatten."""
        hidden = embedded
        for _ in range(2):
            hidden = Conv1D(
                params['num_conv_filters'],
                kernel_size=params['conv_kernel_size'],
                padding='same',
            )(hidden)
            hidden = BatchNormalization()(hidden)
            hidden = Activation('relu')(hidden)
            hidden = Dropout(params['conv_dropout_rate'])(hidden)
        return Flatten()(hidden)

    def timedist_max_branch(embedded):
        """Per-timestep Dense -> BatchNorm -> ReLU -> Dropout, max over axis 1."""
        hidden = TimeDistributed(Dense(EMBEDDING_DIM))(embedded)
        hidden = BatchNormalization()(hidden)
        hidden = Activation('relu')(hidden)
        hidden = Dropout(params['timedist_dropout_rate'])(hidden)
        return Lambda(
            lambda x: K.max(x, axis=1),
            output_shape=(EMBEDDING_DIM, )
        )(hidden)

    def dense_stack(tensor, unit_keys):
        """Chain one Dense -> BatchNorm -> ReLU -> Dropout block per key."""
        for key in unit_keys:
            tensor = Dense(
                params[key],
                kernel_initializer=dense_kernel_init,
                bias_initializer=dense_bias_init,
            )(tensor)
            tensor = BatchNormalization()(tensor)
            tensor = Activation('relu')(tensor)
            tensor = Dropout(params['dense_dropout_rate'])(tensor)
        return tensor

    input_q1, emb_q1 = embed_question()
    input_q2, emb_q2 = embed_question()

    # Feature extractors.
    conv_q1 = conv_branch(emb_q1)
    conv_q2 = conv_branch(emb_q2)

    td_q1 = timedist_max_branch(emb_q1)
    td_q2 = timedist_max_branch(emb_q2)

    # Mid-level transforms: merge each extractor's output across questions.
    mid_level_keys = ('num_dense_1', 'num_dense_2')
    conv_features = dense_stack(concatenate([conv_q1, conv_q2]), mid_level_keys)
    td_features = dense_stack(concatenate([td_q1, td_q2]), mid_level_keys)

    # Magic features.
    magic_input = Input(shape=(X_train_magic.shape[-1], ))

    # Main dense block.
    main_features = dense_stack(
        concatenate([conv_features, td_features, magic_input]),
        ('num_dense_1', 'num_dense_2', 'num_dense_3'),
    )

    prediction = Dense(
        1,
        kernel_initializer=dense_kernel_init,
        bias_initializer=dense_bias_init,
        activation='sigmoid',
    )(main_features)

    model = Model(
        inputs=[input_q1, input_q2, magic_input],
        outputs=prediction,
    )
    model.compile(
        loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['accuracy']
    )

    return model

In [None]:
def _fit_fold(model_params, ix_train, ix_val):
    """Train a fresh model on one CV fold and return its best validation loss.

    Every question pair is presented twice, once as (q1, q2) and once as
    (q2, q1); the magic features and the labels are repeated unchanged. The
    same doubling is applied to the validation rows.

    All large arrays, the model and the fit history are locals of this
    function, so they become unreachable as soon as it returns or raises.

    Args:
        model_params: Parameter mapping forwarded to `create_model`.
        ix_train: Row indices of the training part of the fold.
        ix_val: Row indices of the validation part of the fold.

    Returns:
        The minimum `val_loss` recorded over the epochs that were run.
    """
    def symmetric_inputs(ix):
        inputs = [
            np.vstack([X_train_q1[ix], X_train_q2[ix]]),
            np.vstack([X_train_q2[ix], X_train_q1[ix]]),
            np.vstack([X_train_magic[ix], X_train_magic[ix]]),
        ]
        labels = np.concatenate([y_train[ix], y_train[ix]])
        return inputs, labels

    train_inputs, y_fold_train = symmetric_inputs(ix_train)
    val_inputs, y_fold_val = symmetric_inputs(ix_val)

    model = create_model(model_params)
    history = model.fit(
        train_inputs, y_fold_train,
        validation_data=(val_inputs, y_fold_val),

        batch_size=64,
        epochs=NUM_EPOCHS,
        verbose=1,

        callbacks=[
            EarlyStopping(
                monitor='val_loss',
                min_delta=0.001,
                patience=3,
                verbose=1,
                mode='auto',
            ),
            ModelCheckpoint(
                model_checkpoint_path,
                monitor='val_loss',
                save_best_only=True,
                verbose=2,
            ),
        ],
    )

    return min(history.history["val_loss"])


def evaluate_model(candidate_params, max_folds=1):
    """Score one hyper-parameter suggestion by cross-validated loss.

    Args:
        candidate_params: Mapping with the keys 'num_dense_1', 'num_dense_2',
            'num_dense_3' and 'dropout_rate'. The single dropout rate is used
            for the conv, time-distributed and dense dropout layers; the
            number of conv filters (32) and the kernel size (3) are fixed.
        max_folds: Maximum number of CV folds to fit. The default of 1 keeps
            the parameter search quick (this replaces the former hard-coded
            `break` after the first fold). Pass None to run every fold.

    Returns:
        The negated mean of the per-fold best validation losses, so that a
        higher score means a lower loss (presumably because the optimizer
        maximizes the reported value -- confirm against the experiment
        settings).

    Raises:
        ValueError: If `max_folds` is not None and is smaller than 1, which
            would leave no fold to average.
    """
    if max_folds is not None and max_folds < 1:
        raise ValueError(f'max_folds must be None or >= 1, got {max_folds!r}')

    dropout_rate = candidate_params['dropout_rate']
    model_params = {
        'num_conv_filters': 32,
        'num_dense_1': candidate_params['num_dense_1'],
        'num_dense_2': candidate_params['num_dense_2'],
        'num_dense_3': candidate_params['num_dense_3'],
        'conv_kernel_size': 3,
        'conv_dropout_rate': dropout_rate,
        'timedist_dropout_rate': dropout_rate,
        'dense_dropout_rate': dropout_rate,
    }

    if max_folds is None:
        num_folds_to_run = kfold.n_splits
    else:
        num_folds_to_run = min(max_folds, kfold.n_splits)

    cv_scores = []

    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_q1, y_train)):
        print()
        print(f'Fitting fold {fold_num + 1} of {num_folds_to_run}')
        print()

        try:
            cv_scores.append(_fit_fold(model_params, ix_train, ix_val))
        finally:
            # Clear GPU memory, even when fitting fails, so that a re-run
            # does not start from a session that still holds the old graph.
            K.clear_session()
            gc.collect()

        if fold_num + 1 >= num_folds_to_run:
            break

    return -np.mean(cv_scores)

## Run the optimization loop

In [None]:
# Each round: request a suggestion from SigOpt, evaluate it, and report the
# resulting score back as an observation for that suggestion.
for i in range(NUM_OPTIMIZATION_ITERATIONS):
    experiment_api = conn.experiments(experiment.id)

    print(f'Iteration {i}')
    suggestion = experiment_api.suggestions().create()

    print('Suggestion: ')
    pprint.pprint(suggestion.assignments)

    score = evaluate_model(suggestion.assignments)
    print(f'Score: {score:.6f}')
    print()

    experiment_api.observations().create(suggestion=suggestion.id, value=score)