In [None]:
from __future__ import print_function
import os
import sys
import gzip
import numpy as np
import pandas as pd


from keras import backend as K
from keras.layers import Input, Dense, Dropout, Activation, Conv1D, MaxPooling1D, Flatten
from keras import optimizers
from keras.optimizers import SGD, Adam, RMSprop
from keras.models import Sequential, Model, model_from_json, model_from_yaml
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau


from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler



### Now you define a few variables that could change as you attempt to optimize your model. 

### Often, these are just hard-coded, or else provided as command-line parameters once you know which variables you might be interested in varying.

### Instead, we use a method to initialize these variables from either a config file or from command line parameters. This method is called by CANDLE.


In [None]:
import param_utils as p_utils
def initialize_parameters():
    """Assemble the run parameters from the command line and the config file.

    The NT3 argument parser supplied by ``param_utils`` parses the command
    line; the configuration file named by its ``config_file`` argument is
    then read, and the two sources are merged.

    Returns:
        The consolidated parameter set produced by
        ``param_utils.args_overwrite_config``. Command-line parameters
        overwrite values coming from the configuration file.
    """
    # Command-line parameters come first: they tell us which config file to read.
    cli_args = p_utils.get_nt3_parser().parse_args()

    # Parameters stored in the configuration file.
    file_params = p_utils.read_config_file(cli_args.config_file)

    # Merge both sources; the command line wins on conflicts.
    return p_utils.args_overwrite_config(cli_args, file_params)

# HACK needed to parse command line params in notebook: argparse reads
# sys.argv, so it is replaced with a single empty program name before the
# parser runs.
# `sys` is intentionally NOT deleted afterwards: a `del sys` here would also
# unbind the module-level `import sys` from the imports cell and make any
# later use of `sys` fail with a NameError.
import sys
sys.argv = ['']

gParameters = initialize_parameters()
print(gParameters)


In [None]:
# Everything below is read once from gParameters and exposed as module-level
# constants for the cells that follow.

# Input data: base URL plus the train / test file names
url_nt3 = gParameters['data_url']
FILE_TRAIN = url_nt3 + gParameters['train_data']
FILE_TEST = url_nt3 + gParameters['test_data']

# Reference model: number of output classes and dropout rate
CLASSES = gParameters['classes']
DROPOUT_RATE = gParameters['drop']

# Optimizer settings
OPTIMIZER = gParameters['optimizer']
LEARNING_RATE = gParameters['learning_rate']
DECAY_RATE = gParameters['decay_rate']

# Settings used when compiling the model (the loss is fixed, not configurable)
METRICS = gParameters['metrics']
LOSS = 'categorical_crossentropy'

# Training schedule (the optimized model has a default of 400 epochs)
EPOCHS = gParameters['epochs']
BATCH_SIZE = gParameters['batch_size']

# Output files: model name and directory where results are saved
MODEL_NAME = gParameters['model_name']
OUTPUT_DIR = gParameters['save']

### Now that you've set up your initial variables, it's time to load the data.

In [None]:
def load_data(train_path, test_path):
    """Load the training and test sets, one-hot encode labels, scale features.

    Both files are read concurrently as header-less CSV. Column 0 of each
    file is taken as the integer class label and the remaining columns as
    the feature vector.

    Parameters
    ----------
    train_path : str
        Path or URL of the training CSV, handed to ``pandas.read_csv``.
    test_path : str
        Path or URL of the test CSV, handed to ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test)``. The ``X_*`` arrays hold the
        features after ``MaxAbsScaler`` scaling; the ``Y_*`` arrays are the
        labels converted with ``np_utils.to_categorical`` using the
        module-level ``CLASSES`` value.

    Raises
    ------
    Exception
        Any error raised while reading either file (missing file, bad URL,
        parse error, ...) is re-raised here. Previously such an error only
        killed the reader thread and this function then blocked forever on
        an empty queue.
    """
    import sys
    from concurrent.futures import ThreadPoolExecutor

    def _read_matrix(path, label):
        """Read one header-less CSV into a float32 array, logging progress."""
        sys.stdout.write('looking for ' + path + '\n')
        sys.stdout.flush()
        matrix = pd.read_csv(path, header=None).values.astype('float32')
        sys.stdout.write('done loading ' + label + ' data\n')
        sys.stdout.flush()
        return matrix

    # Read both files in parallel. Future.result() re-raises any exception
    # from the worker, so a failed read surfaces as an error, not a hang.
    with ThreadPoolExecutor(max_workers=2) as pool:
        train_future = pool.submit(_read_matrix, train_path, 'training')
        test_future = pool.submit(_read_matrix, test_path, 'test')
        df_train = train_future.result()
        df_test = test_future.result()

    print('df_train shape:', df_train.shape)
    print('df_test shape:', df_test.shape)

    # The training file's width decides how many columns are used from both sets.
    seqlen = df_train.shape[1]

    # Convert a class vector (integers) to binary class matrix.
    Y_train = np_utils.to_categorical(df_train[:, 0].astype('int'), CLASSES)
    Y_test = np_utils.to_categorical(df_test[:, 0].astype('int'), CLASSES)

    # NOTE(review): the scaler is fit on the training and test rows stacked
    # together, so test-set statistics influence the scaling of the training
    # features. Kept as-is to preserve results; confirm this is intended.
    n_train = df_train.shape[0]
    features = np.concatenate(
        (df_train[:, 1:seqlen], df_test[:, 1:seqlen]), axis=0)
    features = MaxAbsScaler().fit_transform(features)

    # Split the scaled matrix back into its training and test parts.
    X_train = features[:n_train, :]
    X_test = features[n_train:, :]

    return X_train, Y_train, X_test, Y_test

### This allows the code to be executed through the run method as an imported package.

In [None]:
def run(gParameters):
    """Train, evaluate and save the reference 1-D convolutional classifier.

    The data is loaded with ``load_data(FILE_TRAIN, FILE_TEST)``, a fixed
    Conv1D/Dense network is built and trained with SGD, the test-set score
    is printed, and the architecture (JSON) and weights (HDF5) are written
    to ``OUTPUT_DIR``. A per-epoch log goes to ``OUTPUT_DIR/training.log``.

    Parameters
    ----------
    gParameters : dict
        Consolidated parameter set. It is currently not read inside this
        function: all hyperparameters come from the module-level constants
        (``CLASSES``, ``DROPOUT_RATE``, ``LEARNING_RATE``, ...) defined in
        the configuration cell.

    Returns
    -------
    The history object returned by ``model.fit``.
    """
    X_train, Y_train, X_test, Y_test = load_data(FILE_TRAIN, FILE_TEST)

    # this reshaping is critical for the Conv1D to work:
    # (samples, features) -> (samples, features, 1)
    X_train = np.expand_dims(X_train, axis=2)
    X_test = np.expand_dims(X_test, axis=2)
    num_params = X_train.shape[1]

    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Number of parameters: ', num_params)

    # Define the reference model: two convolution stages, then a dense head.
    model = Sequential()
    model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid',
                     input_shape=(num_params, 1)))
    model.add(Activation('relu'))
    model.add(MaxPooling1D(pool_size=1))
    model.add(Conv1D(filters=128, kernel_size=10, strides=1, padding='valid'))
    model.add(Activation('relu'))
    model.add(MaxPooling1D(pool_size=10))
    model.add(Flatten())
    model.add(Dense(200))
    model.add(Activation('relu'))
    model.add(Dropout(DROPOUT_RATE))
    model.add(Dense(20))
    model.add(Activation('relu'))
    model.add(Dropout(DROPOUT_RATE))
    model.add(Dense(CLASSES))
    model.add(Activation('softmax'))

    # Define the optimizer.
    # NOTE(review): the OPTIMIZER setting from the configuration cell is not
    # consulted here; SGD is always used. Confirm whether that is intended.
    optimizer = optimizers.SGD(lr=LEARNING_RATE, decay=DECAY_RATE)

    # Compile the model
    model.summary()
    model.compile(loss=LOSS,
                  optimizer=optimizer,
                  metrics=[METRICS])

    # exist_ok avoids failing if the directory appears between a check and
    # the creation (e.g. several runs sharing one output directory).
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    csv_logger = CSVLogger('{}/training.log'.format(OUTPUT_DIR))

    # Cut the learning rate by 10x when val_loss stalls for 10 epochs.
    # NOTE(review): `epsilon` may be named `min_delta` in newer Keras
    # releases; confirm against the installed version before changing it.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10,
                                  verbose=1, mode='auto', epsilon=0.0001,
                                  cooldown=0, min_lr=0)

    # The test set doubles as the validation set during training.
    history = model.fit(X_train, Y_train,
                        batch_size=BATCH_SIZE,
                        epochs=EPOCHS,
                        verbose=1,
                        validation_data=(X_test, Y_test),
                        callbacks=[csv_logger, reduce_lr])

    # Report the final test-set evaluation (it used to be computed and dropped).
    score = model.evaluate(X_test, Y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test {}:'.format(METRICS), score[1])

    # serialize model to JSON
    model_json = model.to_json()
    with open("{}/{}.model.json".format(OUTPUT_DIR, MODEL_NAME), "w") as json_file:
        json_file.write(model_json)
    print('Saved model to disk')

    # serialize weights to HDF5
    model.save_weights("{}/{}.model.h5".format(OUTPUT_DIR, MODEL_NAME))
    print('Saved weights to disk')

    return history

### This allows the code to be executed at the command line.


In [None]:
def main():
    """Entry point: gather the parameters and hand them to ``run``."""
    run(initialize_parameters())

if __name__ == '__main__':
    try:
        main()
    finally:
        # Always release the backend session, even when main() fails, so a
        # failed run does not leave backend state behind in a live kernel.
        try:
            K.clear_session()
        except AttributeError:      # theano does not have this function
            pass