In [2]:
# h2o supplies the frames and cluster connection; h2ohyperopt supplies the
# optimizers and the ModelDocker used in the rest of the notebook.
import h2o
import h2ohyperopt
# Initialise the H2O connection; the cluster status table below is its output.
h2o.init()

0,1
H2O cluster uptime:,3 seconds 593 milliseconds
H2O cluster version:,3.8.3.3
H2O cluster name:,H2O_started_from_python_abhishek_zej259
H2O cluster total nodes:,1
H2O cluster total free memory:,1.76 GB
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster healthy:,True
H2O Connection ip:,127.0.0.1
H2O Connection port:,54321


### Data Processing
The dataset used to demonstrate the capabilities of H2OHyperopt is the Titanic dataset. The function `data()` preprocesses it and splits it into training, test and validation frames.

In [3]:
def data():
    """
    Load the example titanic dataset and prepare it for modelling.

    The selected columns are split 60% / 20% / 20% into training, test
    and validation frames using a fixed seed.

    Output
    ---------------------
    trainFr: Training H2OFrame.
    testFr: Test H2OFrame.
    validFr: Validation H2OFrame.
    predictors: List of predictor columns for the Training frame.
    response: String defining the response column for Training frame.
    """
    response = 'survived'
    feature_columns = ['pclass', 'age', 'sex', 'sibsp', 'parch', 'ticket',
                       'embarked', 'fare']
    categorical_columns = ['pclass', 'sex', 'sibsp', 'embarked', response]

    raw_frame = h2o.import_file(
        path="https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")

    # Convert the categorical columns (including the response) to factors
    for name in categorical_columns:
        raw_frame[name] = raw_frame[name].asfactor()

    # Keep only the modelling columns, with the response as the last column
    modelling_frame = raw_frame[feature_columns + [response]]

    # Two ratios are given and three frames are unpacked: 60%, 20% and the rest
    trainFr, testFr, validFr = modelling_frame.split_frame([0.6, 0.2],
                                                           seed=1234)

    # Every column except the response is a predictor
    predictors = [name for name in trainFr.names if name != response]
    return trainFr, testFr, validFr, predictors, response

In [4]:
# Build the train/test/validation frames plus the predictor list and response name
trainFr, testFr, validFr, predictors, response = data()


Parse Progress: [##################################################] 100%


### Multiple Model Type Based Optimization
Let us demonstrate the ModelDocker. Since this is a binary classification problem, we set the metric to AUC. We dock three types of models: GBMs, GLMs and DLEs (deep learning models).

In [5]:
# Search space for the GBM, mixing Default parameters with customized ones:
# 'Default' entries are searched over the library's own range, plain values
# are held fixed, and tuples describe a custom distribution to sample from.
# To use the default search space for every parameter instead:
# model_gbm.select_optimization_parameters("Default")
gbm_search_space = {
    'col_sample_rate': 'Default',
    'ntrees': 200,
    'learn_rate': ('uniform', (0.05, 0.2)),
    'nfolds': 7
}
model_gbm = h2ohyperopt.GBMOptimizer(metric='auc')
model_gbm.select_optimization_parameters(gbm_search_space)

In [6]:
# Selecting parameters to optimize on: a default range for epsilon, a fixed
# adaptive_rate and nfolds, and a choice between two hidden-layer layouts.
dle_search_space = {
    'epsilon': 'Default',
    'adaptive_rate': True,
    'hidden': ('choice', [[10, 20], [30, 40]]),
    'nfolds': 7
}
model_dle = h2ohyperopt.DLEOptimizer(metric='auc')
model_dle.select_optimization_parameters(dle_search_space)

In [7]:
# GLM optimizer for the classification problem, scored on AUC like the others
model_glm = h2ohyperopt.GLMOptimizer(metric='auc', problemType='Classification')
# Selecting default parameters to optimize on
model_glm.select_optimization_parameters('Default')

### Starting the Optimization

The optimization is started using the function ```start_optimization```. It is necessary to provide a validation frame so that the optimization algorithm can evaluate the model.

In [8]:
# Dock the three optimizers together so they are compared on the same metric
docked_models = [model_dle, model_gbm, model_glm]
docker = h2ohyperopt.ModelDocker(docked_models, 'auc')
# Run 20 evaluations, using the validation frame to evaluate each model
docker.start_optimization(
    num_evals=20,
    trainingFr=trainFr,
    validationFr=validFr,
    response=response,
    predictors=predictors
)


glm Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%

gbm Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%

gbm Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%

deeplearning Model Build Progress: [##################################################] 100%

gbm Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%

gbm Model Build Progress: [##################################################] 100%

gbm Model Build Progress: [##################################################] 100%

glm Model Build Progress: [############################

### Metrics available to User

In [9]:
# To access the best model found during the optimization
bestmodel = docker.best_model

In [10]:
# To get the best model parameters; the result shown below is a tuple of
# (estimator class name, parameter dict)
docker.best_model_parameters()

('H2OGradientBoostingEstimator',
 {'col_sample_rate': 0.6783596802934229,
  'learn_rate': 0.058252049654680824,
  'metric': 'auc',
  'nfolds': 7,
  'ntrees': 200})

In [11]:
# To get the training and validation scores of the best model for the
# chosen metric (auc here)
docker.best_model_scores()

The training loss metric(auc) is : 0.995106776992
The validation loss metric(auc) is : 0.855424871151


### Ensembling

Three ensemble methods are provided for the user:

1. Use the best n models from evaluated models
2. Use the best n models from each model class
3. Smart ensembling using correlation

In [12]:
%%capture
# To use best in class ensembling (the cell's output is suppressed by the
# %%capture magic on the cell's first line)
docker.best_in_class_ensembles()

In [13]:
# Score the current ensemble on the held-out test frame
testDockerScore = docker.score_ensemble(testFr)


glm prediction Progress: [##################################################] 100%

gbm prediction Progress: [##################################################] 100%

deeplearning prediction Progress: [##################################################] 100%


In [14]:
# Display the ensemble's score on the test frame
testDockerScore

0.7503803888419274

In [15]:
%%capture
# To use smart ensembling
# NOTE(review): in the recorded run this call fails with
# "AttributeError: ModelDocker instance has no attribute 'smart_ensembling'"
# (see the output below). Confirm the correct method name against the
# installed h2ohyperopt version before relying on this cell.
docker.smart_ensembling()

AttributeError: ModelDocker instance has no attribute 'smart_ensembling'