# Week 3 - Grid Search

In [1]:
import h2o
# for grid search, also need to import h2o.grid
import h2o.grid

In [2]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,1 hour 9 mins
H2O cluster timezone:,America/Chicago
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,9 days
H2O cluster name:,H2O_from_python_megan_p7ovho
H2O cluster total nodes:,1
H2O cluster free memory:,1.348 Gb
H2O cluster total cores:,3
H2O cluster allowed cores:,3


In [3]:
data = h2o.import_file('http://h2o-public-test-data.s3.amazonaws.com/smalldata/airlines/allyears2k_headers.zip')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
data.nrows

43978

In [5]:
train, valid, test = data.split_frame([0.8, 0.1], seed=69)

In [6]:
print('%d/%d/%d' % (train.nrows, valid.nrows, test.nrows))

35255/4272/4451


In [7]:
y = 'IsArrDelayed'
# don't use fields that we know wouldn't be available at prediction time
ignore_fields = ['ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay',
                 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
                 'IsDepDelayed', 'IsArrDelayed', 'ActualElapsedTime']
# create two x's, one for all but ignored and one for the likely good predictors
x_all = [i for i in train.names if i not in ignore_fields]
x_likely = ['Month', 'DayOfWeek', 'UniqueCarrier', 'Origin',
            'Dest', 'Distance', 'Cancelled', 'Diverted']

In [8]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [9]:
# create default model with x_all
m_def = H2OGeneralizedLinearEstimator(family='binomial')
m_def.train(x_all, y, train, validation_frame=valid)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [10]:
# get value of logloss for this model on validation data
m_def.logloss(valid=True)

0.6230175599834998

In [11]:
# set up the grid search using lambda search, search over params for alpha
# build max of 8 models and run for max of 30 seconds
# use random discrete (instead of cartesian) search
g = h2o.grid.H2OGridSearch(
    H2OGeneralizedLinearEstimator(
        family='binomial',
        lambda_search=True
    ),
    hyper_params={
        'alpha': [x * 0.01 for x in range(0, 100)]
    },
    search_criteria={
        'strategy': 'RandomDiscrete',
        'max_models': 8,
        'max_runtime_secs': 30
    }
)
g.train(x_all, y, train, validation_frame=valid)

glm Grid Build progress: |████████████████████████████████████████████████| 100%


In [12]:
g

                    alpha  \
0                  [0.96]   
1                  [0.84]   
2    [0.8200000000000001]   
3                  [0.79]   
4                   [0.6]   
5                  [0.51]   
6                  [0.08]   
7                  [0.03]   

                                                      model_ids  \
0  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_41_model_4   
1  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_41_model_7   
2  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_41_model_1   
3  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_41_model_5   
4  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_41_model_6   
5  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_41_model_2   
6  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_41_model_8   
7  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_41_model_3   

              logloss  
0  0.5902640428683208  
1   0.591548596791795  
2  0.5917254167886266  
3  0.5919977408143924  
4  0.594010740



In [13]:
# cartesian version - note that you could leave search_criteria off as this is the H2O default
# as this will run for all alphas, scale this back to a smaller number of possible params
g2 = h2o.grid.H2OGridSearch(
    H2OGeneralizedLinearEstimator(
        family='binomial',
        lambda_search=True
    ),
    hyper_params={
        'alpha': [0, 0.2, 0.4, 0.5, 0.6, 0.8, 0.99]
    },
    search_criteria={
        'strategy': 'Cartesian'
    }
)
# note trying this with x_likely instead of x_all
g2.train(x_likely, y, train, validation_frame=valid)

glm Grid Build progress: |████████████████████████████████████████████████| 100%


In [14]:
g2

      alpha                                                     model_ids  \
0     [0.4]  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_58_model_3   
1     [0.5]  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_58_model_4   
2     [0.2]  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_58_model_2   
3     [0.6]  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_58_model_5   
4    [0.99]  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_58_model_7   
5     [0.0]  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_58_model_1   
6     [0.8]  Grid_GLM_py_3_sid_b1d5_model_python_1581805811022_58_model_6   

              logloss  
0  0.6487155628828557  
1  0.6487174438261994  
2  0.6487198598346858  
3  0.6487283432868978  
4  0.6487570996218839  
5  0.6487590917289415  
6  0.6487697515569714  




Note that the logloss is higher across the board when we reduce the features down from x_all to x_likely