# Importing required libraries

In [2]:
# Load Keras libraries used in this example

import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from keras.utils import plot_model
from keras.models import load_model

Using TensorFlow backend.


In [3]:
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional
from hyperas.utils import space_eval

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import random as rn
from keras import backend as K
from sklearn.model_selection import train_test_split

import os
import sys

if sys.version_info[0] < 3: 
    from StringIO import StringIO # Python 2.x
else:
    from io import StringIO # Python 3.x

# Modelling

In [5]:
def data():
    
    '''
    Data providing function:

    Make sure to have every relevant import statement included here and return data as
    used in model function below. This function is separated from model() so that hyperopt
    won't reload data for each evaluation run.
    '''
    
    # Importing Libraries for Local Scope
    import boto3
    
    # UDFs
    
    # Data fetch
    def fetch_data_froms3(url,aws_access_key_id,aws_secret_access_key):

        client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)

        bucket_name = url.split('/')[-2] #'test-house-pred'
        object_key = url.split('/')[-1] #'house_data.csv' 

        csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)
        body = csv_obj['Body']
        csv_string = body.read().decode('utf-8')

        df = pd.read_csv(StringIO(csv_string),index_col=0)

        return df
    
    # Train Test Split    
    def split_train_test(df,test_pct,label_col):
    
        train,test = train_test_split(df,test_size=test_pct, random_state=42)

        return train,test
    
    # from url
    url = 'https://s3.us-east-2.amazonaws.com/test-house-pred/house_data.csv'
    label_col = 'price'
    
    data = fetch_data_froms3(url=url,aws_access_key_id='AKIAJVVW75JRYXQ63PUA',aws_secret_access_key='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')
    
    train,test = split_train_test(df=data,test_pct=0.25,label_col=label_col)
    
    x_train = train.loc[:, train.columns != label_col]
    y_train = train[label_col]
    
    x_test = test.loc[:, test.columns != label_col]
    y_test = test[label_col]
    
    # Pre processing
    
    # Calculating column wise mean and std
    mean = x_train.mean(axis=0)
    std = x_train.std(axis=0)

    x_train = (x_train - mean) / std
    x_test = (x_test - mean) / std
    
    return x_train, y_train, x_test, y_test #these names must be same as the args of create model. Else x_train not defined error is thrown

In [6]:
def create_model(x_train, y_train, x_test, y_test):
    """
    Model providing function:

    Create Keras model with double curly brackets dropped-in as needed.
    Return value has to be a valid python dictionary with two customary keys:
        - loss: Specify a numeric evaluation metric to be minimized
        - status: Just use STATUS_OK and see hyperopt documentation if not feasible
    The last one is optional, though recommended, namely:
        - model: specify the model just created so that we can later use it again.
    """
    model = Sequential()
    model.add(Dense({{choice([32, 64, 128, 256,512])}}, input_shape=(x_train.shape[1],)))
    model.add(Activation({{choice(['relu', 'sigmoid'])}}))
    model.add(Dropout({{uniform(0, 1)}}))

    # If we choose 'two_hidden', add an additional layer
    if {{choice(['one_hidden', 'two_hidden'])}} == 'two_hidden':
        model.add(Dense({{choice([32, 64, 128, 256,512])}}))
        model.add(Activation({{choice(['relu', 'sigmoid'])}}))
        model.add(Dropout({{uniform(0, 1)}}))
    
    model.add(Dense(1))
    model.add(Activation('linear'))

    model.compile(loss='mse', metrics=['mae'],
                  optimizer={{choice(['rmsprop', 'adam', 'nadam','sgd'])}})
    
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience={{choice([10,20,30,40,50])}})
    
    model.fit(x_train, y_train,
              batch_size={{choice([16,32,64,128])}},
              epochs=500,
              callbacks=[early_stop],
              verbose=0,
              validation_split=0.2)
#               validation_data=(x_test, y_test))

    [loss, mae] = model.evaluate(x_test, y_test, verbose=0)
#     print('loss:',loss)
#     print('mae:',mae)
    
    # In cases where the loss turns out to be nan (due to bad network architecture)
    # An Assertion error is raised by hyperopt. Because of the nan value of loss.
    # So, to avoid such a case, we update loss to infinity in that case.
    if(np.isnan(mae)):
        print('nan loss')
        return {'loss': np.inf, 'status': STATUS_OK, 'model': model}
    
    print("Testing set Mean Abs Error: {:7.2f}".format(mae))
    return {'loss': loss, 'status': STATUS_OK, 'model': model}

In [7]:
def get_best_model():
    
    X_train, Y_train, X_test, Y_test = data()
    
    trials=Trials()
    best_run, best_model, space = optim.minimize(model=create_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=10, #check how to pass maximum value
                                          trials=trials,
                                         notebook_name='5_automated_pipeline_using_hyperas',
                                          eval_space = True, #gives actual values of params in best run instead of list indeces
                                          verbose=False,
                                          return_space=True) #returns the search space of hyperopt. Can be used To display all the trails.
    
    print("\n\nEvalutation of best performing model:")
    print(best_model.evaluate(X_test, Y_test, verbose=0))
    print("\n\nBest performing model chosen hyper-parameters:")
    print(best_run)
    
    return best_model, trials, space

In [8]:
if __name__ == '__main__':
    best_model, trials, space = get_best_model()

Testing set Mean Abs Error:    3.43
Testing set Mean Abs Error:    3.66
Testing set Mean Abs Error:    3.53
Testing set Mean Abs Error:   16.49
Testing set Mean Abs Error:    3.14
Testing set Mean Abs Error: 769785225395184861184.00
Testing set Mean Abs Error:    3.50
Testing set Mean Abs Error:    4.48
Testing set Mean Abs Error:    3.53
Testing set Mean Abs Error:    4.14


Evalutation of best performing model:
[27.84646812198669, 3.1361576572177916]


Best performing model chosen hyper-parameters:
{'Activation': 'relu', 'Activation_1': 'relu', 'Dense': 256, 'Dense_1': 128, 'Dropout': 0.587606728324542, 'Dropout_1': 'two_hidden', 'Dropout_2': 0.2330896882313117, 'batch_size': 128, 'optimizer': 'rmsprop', 'patience': 40}


#### The url and label column cannot be passed as arguments to the get best model function. This is because Hyperas doesn't support arguments to be passed to the data function. So, all the code will be shifted to Hyperopt.