# Stacked Ensembles

In [1]:
import h2o
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_232"; OpenJDK Runtime Environment (build 1.8.0_232-8u232-b09-0ubuntu1~19.04.1-b09); OpenJDK 64-Bit Server VM (build 25.232-b09, mixed mode)
  Starting server from /home/megan/Projects/h2oclass/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpb9w9j7k0
  JVM stdout: /tmp/tmpb9w9j7k0/h2o_megan_started_from_python.out
  JVM stderr: /tmp/tmpb9w9j7k0/h2o_megan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Chicago
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,18 days
H2O cluster name:,H2O_from_python_megan_5cie8u
H2O cluster total nodes:,1
H2O cluster free memory:,1.520 Gb
H2O cluster total cores:,3
H2O cluster allowed cores:,3


In [3]:
# import airlines data
data = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/airlines/allyears2k_headers.zip")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
# split the data
train, valid, test = data.split_frame([0.8,0.1], seed=69)

In [5]:
# view the split
print("%d/%d/%d" %(train.nrows, valid.nrows, test.nrows))

35255/4272/4451


In [6]:
# setup x and y
y = 'IsArrDelayed'
ignore_fields = ['ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 
                 'SecurityDelay', 'LateAircraftDelay', 'IsDepDelayed', 
                 'IsArrDelayed', 'ActualElapsedTime', 'ArrTime', 'TailNum']
x = [i for i in train.names if i not in ignore_fields]

In [7]:
# stacked ensembles only work with cross validation
# put train and valid back together
train2 = train.rbind(valid)

In [8]:
# for cross validation, set the number of folds we will use
nfolds = 5

In [9]:
# import the estimators we will use for our ensemble
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

In [10]:
# create the GLM model
m_GLM = H2OGeneralizedLinearEstimator(
    family='binomial',
    model_id='glm_def',
    nfolds=nfolds,
    fold_assignment='Modulo',
    keep_cross_validation_predictions=True
)
m_GLM.train(x, y, train2)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [12]:
# create the GBM model
m_GBM = H2OGradientBoostingEstimator(
    model_id='gbm_def',
    nfolds=nfolds,
    fold_assignment='Modulo',
    keep_cross_validation_predictions=True
)
m_GBM.train(x, y, train2)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [13]:
# creat the random forest model
m_RF = H2ORandomForestEstimator(
    model_id='rf_def',
    nfolds=nfolds,
    fold_assignment='Modulo',
    keep_cross_validation_predictions=True
)
m_RF.train(x, y, train2)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [14]:
models = [m_GLM.model_id, m_GBM.model_id, m_RF.model_id]

In [15]:
# create the stacked ensemble
m_SE = H2OStackedEnsembleEstimator(
    model_id='SE_glm_gbm_rf',
    base_models=models
)
m_SE.train(x, y, train2)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [16]:
all_models = [m_GLM, m_GBM, m_RF, m_SE]

In [17]:
names = ['GLM', 'GBM', 'RF', 'SE']

In [18]:
pd.Series(map(lambda x: x.logloss(), all_models), names)

GLM    0.573104
GBM    0.508120
RF     0.512384
SE     0.238563
dtype: float64

In [19]:
# this looks suspicious...
pd.Series(map(lambda x: x.auc(), all_models), names)

GLM    0.768711
GBM    0.850491
RF     0.835380
SE     0.990587
dtype: float64

In [20]:
pd.Series(map(lambda x: x.auc(xval=True), all_models), names)

GLM    0.760722
GBM    0.805889
RF     0.839532
SE          NaN
dtype: float64

In [21]:
test_perf = list(map(lambda x: x.model_performance(test), all_models))

In [22]:
# much more realistic
pd.Series(map(lambda x: x.logloss(), test_perf), names)

GLM    0.580376
GBM    0.544807
RF     0.483963
SE     0.479883
dtype: float64

In [23]:
# RF still the best individual model, but SE eeks out a little more in this case
pd.Series(map(lambda x: x.auc(), test_perf), names)

GLM    0.755720
GBM    0.801738
RF     0.846955
SE     0.847356
dtype: float64

In [24]:
# to save mojo
m_GBM.save_mojo()
# note that to save binary, instead use h2o.save_model(m_GBM)

'/home/megan/Projects/h2oclass/gbm_def.zip'

In [25]:
# to save a json file with model details
m_GBM.save_model_details()

'/home/megan/Projects/h2oclass/gbm_def.json'

In [26]:
# to load a model back in, use h2o.load_model('path/to/file')

In [27]:
# now this is implemented (wasn't when class was recorded)
m_SE.save_mojo()

'/home/megan/Projects/h2oclass/SE_glm_gbm_rf.zip'