# Ames housing: xgboost with h2o

Contents
 - start
 - h2o
 
Notes
 - **h2o does not need one-hot encoding for hccv's**

Sources:
http://ww2.amstat.org/publications/jse/v19n3/decock.pdf

Copyright (C) 2018 Alan Chalk  
Please do not distribute or publish without permission.

## Start_.

**packages**

In [None]:
import os
import numpy as np
import pandas as pd
import pickle

import h2o
from h2o.estimators import H2OXGBoostEstimator
from h2o.grid.grid_search import H2OGridSearch

import matplotlib.pyplot as plt
%matplotlib inline

**directories and paths**

In [None]:
# Data folders, given relative to the current working directory (printed
# below so it is clear where the notebook is running from).
dirRawData, dirPData = "../input/", "../PData/"
print(os.getcwd())

**Functions**

In [None]:
def fn_MAE(actuals, predictions):
    """Return the mean absolute error, rounded to the nearest whole number.

    Both inputs are converted to NumPy float arrays first, so plain lists and
    tuples are accepted, and pandas objects are compared element by element by
    position instead of being aligned on their index labels.

    Args:
        actuals: Observed values (array-like).
        predictions: Predicted values (array-like, same length as actuals).

    Returns:
        mean(|predictions - actuals|) passed through np.round.
    """
    actuals = np.asarray(actuals, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    return np.round(np.mean(np.abs(predictions - actuals)))

def fn_RMSE(actuals, predictions):
    """Return the root mean squared error, rounded to the nearest whole number.

    Both inputs are converted to NumPy float arrays first, so plain lists and
    tuples are accepted, and pandas objects are compared element by element by
    position instead of being aligned on their index labels.

    Args:
        actuals: Observed values (array-like).
        predictions: Predicted values (array-like, same length as actuals).

    Returns:
        sqrt(mean((predictions - actuals)**2)) passed through np.round.
    """
    actuals = np.asarray(actuals, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    return np.round(np.sqrt(np.mean((predictions - actuals) ** 2)))

**Load data**

In [None]:
# Load df_all from the pickled dictionary (the version that is NOT one-hot
# encoded).  The earlier HDF5 route is kept below for reference.
#df_all = pd.read_hdf(dirPData + '02_df_all.h5', 'df_all')
f_name = dirPData + '02_df.pickle'
with open(f_name, "rb") as f:
    df_all = pickle.load(f)['df_all']

# Only df_all is needed from this cell.
del f_name

In [None]:
# Read the pickled dictionary that describes the modelling variables and
# unpack the four entries used by this notebook.
f_name = dirPData + '02_vars.pickle'
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)

var_dep, vars_ind_numeric, vars_ind_categorical, vars_ind_onehot = (
    dict_[key]
    for key in ('var_dep',
                'vars_ind_numeric',
                'vars_ind_categorical',
                'vars_ind_onehot')
)

# The dictionary itself is not needed any more.
del dict_

In [None]:
# Boolean row masks built from the 'fold' column:
# folds 0-5 -> train, 6-7 -> validation, 8-9 -> test.
idx_train = df_all['fold'].isin(range(6))
idx_val = df_all['fold'].isin([6, 7])
idx_test = df_all['fold'].isin([8, 9])
# "design" is train and validation together, i.e. folds 0-7.
idx_design = idx_train | idx_val

# Target as a flat vector, plus one slice of it per data split.
y = df_all[var_dep].values.ravel()
y_train, y_val, y_design, y_test = (
    y[mask] for mask in (idx_train, idx_val, idx_design, idx_test)
)

In [None]:
# Predictors used for modelling: the categorical columns followed by the
# numeric ones.  The one-hot columns (vars_ind_onehot) are not included.
# NOTE(review): assumes both operands are plain lists - confirm.
vars_ind = vars_ind_categorical + vars_ind_numeric

### xgboost

In [None]:
# Connect to a local h2o server, starting one if none is running.
h2o.init()
# Alternative kept for reference: connect to an existing server only.
#h2o.connect()

**Send data to h2o**

In [None]:
# Upload the predictors, the target column(s) and the fold label to h2o as
# one frame stored under the key 'df_all'.
# NOTE(review): the list concatenation assumes var_dep is a list - confirm.
h2o_df_all = h2o.H2OFrame(df_all[vars_ind + var_dep + ['fold']],
                         destination_frame = 'df_all')

In [None]:
# Send each boolean row mask to h2o as a 0/1 integer column so that it can
# be used to select rows of h2o_df_all.
idx_h2o_train, idx_h2o_val, idx_h2o_design, idx_h2o_test = (
    h2o.H2OFrame(mask.astype('int').values)
    for mask in (idx_train, idx_val, idx_design, idx_test)
)

**One xgboost model**

 - example with a relatively small number of trees and a large learning rate

In [None]:
# Settings for a single exploratory model: 200 trees at learning rate 0.02,
# with MAE as the early-stopping metric.
param = dict(
    ntrees=200,
    max_depth=6,
    learn_rate=0.02,
    sample_rate=0.7,
    col_sample_rate_per_tree=0.9,
    min_rows=10,
    seed=2019,
    # feature_fraction_seed=2019,
    stopping_metric='mae',
    stopping_rounds=20,
)

In [None]:
# Create the xgboost estimator from the settings in param (not yet trained).
model = H2OXGBoostEstimator(**param)

In [None]:
# Fit on the training rows.  The validation rows are passed as well, so the
# scoring history also contains validation metrics.
model.train(x=vars_ind, 
            y='saleprice',
            training_frame=h2o_df_all[idx_h2o_train, :],
            validation_frame=h2o_df_all[idx_h2o_val, :]
            )

In [None]:
# Silence the h2o progress bars while predicting.  The try/finally makes
# sure they are switched back on even if one of the calls below fails.
h2o.no_progress()
try:
    pred_train = model.predict(h2o_df_all[idx_h2o_train, :])
    pred_val   = model.predict(h2o_df_all[idx_h2o_val, :])
    #pred_test  = model.predict(h2o_df_all[idx_h2o_test, :])

    # h2o frame -> pandas -> flat numpy vector
    pred_train = pred_train.as_data_frame().values.ravel()
    pred_val   = pred_val.as_data_frame().values.ravel()
    #pred_test  = pred_test.as_data_frame().values.ravel()

    print('train error', fn_MAE(y[idx_train], pred_train))
    print('val error',   fn_MAE(y[idx_val],   pred_val))
    #print('test error',  fn_MAE(y[idx_test],  pred_test))
finally:
    h2o.show_progress()

**Train and validation curves**

We can see that more trees are certainly necessary.

In [None]:
# Metrics recorded by h2o while the model was being trained.
df_scoring_history = model.scoring_history()

fig = plt.figure(figsize=(12, 6))
ax1 = fig.add_subplot(1, 1, 1)

# One MAE curve per data set: (history column, line format, legend label).
for _column, _fmt, _label in (
    ('training_mae', 'k-', 'training'),
    ('validation_mae', 'r-', 'validation'),
):
    ax1.plot(df_scoring_history['number_of_trees'],
             df_scoring_history[_column],
             _fmt,
             label=_label)

ax1.set_xlabel('number of trees', fontsize=20)
ax1.set_ylabel('mae', fontsize=20)
_ = ax1.legend(fontsize=20)
#ax1.set_ylim([0, 50000])

## Random grid search

There are at least 3 hyper-parameters we should look at:
 - sample_rate
 - col_sample_rate_per_tree
 - min_rows
 
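The grid below varies max_depth (5 values), sample_rate (6 values) and min_rows (5 values), with col_sample_rate_per_tree held at 0.9. A full grid search would mean 5 x 6 x 5 = 150 models to fit and test - which will take a while.

Rather we will randomly sample the grid and limit our compute effort to 20 models.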

How good is random grid search?  See for example http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf

In [None]:
# Candidate values for the three hyper-parameters being searched.
max_depth = [5, 6, 7, 8, 9]
sample_rate = [0.1, 0.2, 0.4, 0.6, 0.8, 0.9]
min_rows = [2, 5, 10, 20, 40]

# Size of the full grid.
n_iter = len(max_depth) * len(sample_rate) * len(min_rows)

# Every combination of the three lists, one combination per row.
params = np.array(
    [axis.ravel() for axis in np.meshgrid(max_depth, sample_rate, min_rows)]
).T

# Results table: the parameter columns plus empty (NaN) metric columns that
# are filled in as models are fitted.
df_results = pd.DataFrame(params, columns = ['max_depth', 'sample_rate', 'min_rows'])
for _metric in ('train_mae', 'val_mae', 'train_rmse', 'val_rmse'):
    df_results[_metric] = np.nan


In [None]:
# Shuffle the grid so that taking its first rows gives a random sample of
# the configurations.  The fixed random_state makes that sample - and hence
# the results of the search - the same on every run.
df_results = df_results.sample(frac=1, random_state=2018).reset_index(drop=True)

In [None]:
# Fit the first n_models configurations in df_results and record the train
# and validation MAE / RMSE of each one.

n_models = 20

# The training and validation rows are the same for every model, so the two
# h2o frames are sliced once, outside the loop.
h2o_train_frame = h2o_df_all[idx_h2o_train, :]
h2o_val_frame   = h2o_df_all[idx_h2o_val, :]

for idx in range(n_models):

    print(idx, 'of', n_models)

    # Per-model values get their own names so that the grid lists
    # max_depth / sample_rate / min_rows are not overwritten.
    row = df_results.iloc[idx]
    depth_i = int(row['max_depth'])  # cast: the column may hold floats
    sample_rate_i = row['sample_rate']
    min_rows_i = row['min_rows']

    # NOTE(review): with score_tree_interval=100 and ntrees=2000 there are
    # only about 20 scoring events, so stopping_rounds=20 can presumably
    # never trigger - confirm that this is intended.
    param = {
      "ntrees" : 2000 # really need something like 2000 for eta 0.002
    , "max_depth" : depth_i
    , "learn_rate" : 0.002
    , "sample_rate" : sample_rate_i
    , "col_sample_rate_per_tree" : 0.9
    , "min_rows" : min_rows_i
    , "seed": 2018
    , "score_tree_interval": 100
    , "stopping_metric": 'mae'
    , "stopping_rounds": 20
    }

    model = H2OXGBoostEstimator(**param)

    model.train(x=vars_ind,
                y='saleprice',
                training_frame=h2o_train_frame,
                validation_frame=h2o_val_frame
                )

    # h2o frame -> pandas -> flat numpy vector
    model_pred_train = model.predict(h2o_train_frame).as_data_frame().values.ravel()
    model_pred_val   = model.predict(h2o_val_frame).as_data_frame().values.ravel()

    metrics = {
        'train_mae':  fn_MAE(y[idx_train], model_pred_train),
        'val_mae':    fn_MAE(y[idx_val], model_pred_val),
        'train_rmse': fn_RMSE(y[idx_train], model_pred_train),
        'val_rmse':   fn_RMSE(y[idx_val], model_pred_val),
    }

    # Write through .loc on the frame itself.  Chained assignment of the form
    # df[col].iloc[i] = value is avoided: it raises SettingWithCopyWarning and
    # does not update the frame at all when pandas copy-on-write is enabled.
    row_label = df_results.index[idx]
    for col, value in metrics.items():
        df_results.loc[row_label, col] = value


In [None]:
# Keep only the rows without missing values, then display the table.
df_results = df_results.dropna(axis=0)
df_results

In [None]:
# Show the configuration with the lowest validation MAE.
# idxmin() returns an index *label*, so the row is fetched with .loc (label
# based) rather than .iloc (position based); the two only coincide while
# the index happens to be 0, 1, 2, ... with no gaps.
idxmin = df_results['val_mae'].idxmin()
print(df_results.loc[idxmin])
# output when run last
# max_depth          7.0
# sample_rate        0.6
# min_rows           2.0

# train_mae       8157
# val_mae        14472

 - I don't like the min_rows = 2 parameter.  
 - Nor do I believe it (that you can move in a direction defined by just two house prices).
 - I would always prefer something which does not overfit the data by too much - and this gap between train and validation MAE (roughly 8,000 vs 14,000) is much too big.
 - The extract below shows that at max_depth of 7, the two runs with similar sample rates (0.6 and 0.8) have similar val_mae, but the one with min_rows 10 has much less overfitting.

In [None]:
# Show all evaluated configurations that use max_depth 7.
df_results.loc[df_results['max_depth'] == 7]

**xgboost with best hyper-parameters**

In [None]:
# Settings for the final model.
# NOTE(review): min_rows is 40 here - confirm that this is the value
# intended after looking at the search results.
param = dict(
    ntrees=2000,
    max_depth=7,
    learn_rate=0.002,
    sample_rate=0.6,
    col_sample_rate_per_tree=0.9,
    min_rows=40,
    seed=2019,
    score_tree_interval=100,
    stopping_metric='mae',
    stopping_rounds=20,
)

xg_bst = H2OXGBoostEstimator(**param)

# Fit on the design rows (train + validation folds); no validation frame is
# passed this time.
xg_bst.train(
    x=vars_ind,
    y='saleprice',
    training_frame=h2o_df_all[idx_h2o_design, :],
)

# Predict each split and flatten: h2o frame -> pandas -> numpy vector.
bst_pred_train = xg_bst.predict(h2o_df_all[idx_h2o_train, :]).as_data_frame().values.ravel()
bst_pred_val = xg_bst.predict(h2o_df_all[idx_h2o_val, :]).as_data_frame().values.ravel()
bst_pred_test = xg_bst.predict(h2o_df_all[idx_h2o_test, :]).as_data_frame().values.ravel()

# The train and validation rows were both part of the fitting data above, so
# only the test error is measured on rows the model has not seen.
print('train error', fn_MAE(y[idx_train], bst_pred_train))
print('val error',   fn_MAE(y[idx_val], bst_pred_val))
print('test error',  fn_MAE(y[idx_test], bst_pred_test))

#       last run gave  RF with leafsize 20
#             xgb          was
#train error 10,073       13,544
#val error    9,944       13,696
#test error  13,079       14,801

**Save the model**

In [None]:
# Save the fitted model under dirPData (force=True: overwrite an existing
# file) and keep the path that h2o returns.
xg_bst_path = h2o.save_model(model=xg_bst, path=dirPData, force=True)

In [None]:
# Display the path the model was saved to.
xg_bst_path

In [None]:
# Read the saved model back from disk into xg_bst
# (presumably as a check that the saved file can be loaded).
xg_bst = h2o.load_model(path = xg_bst_path)

**Save predictions and model information**

In [None]:
# Load the predictions dictionary and take the predictions table out of it.
# dict_ and f_name are deliberately kept: dict_ is updated and written back
# in the cells that follow.

# Earlier HDF5 route, kept for reference:
#store = pd.HDFStore(dirPData + 'predictions.h5')
#df_predictions = pd.read_hdf(store, 'df_predictions')
#store.close()

f_name = dirPData + 'dict_predictions.pickle'

with (open(f_name, "rb")) as f:
    dict_ = pickle.load(f)

df_predictions = dict_['df_predictions']


In [None]:
# Score every row of h2o_df_all with the final model and store the result
# as column 'm_3a' of the predictions table.
df_predictions['m_3a'] = xg_bst.predict(h2o_df_all).as_data_frame().values.ravel()

# Record the updated table, a short description and the model file path.
dict_.update({
    'df_predictions': df_predictions,
    'm_3a': 'xgboost, saved as m_3a',
    'm_3a_path': xg_bst_path,
})

In [None]:
# Quick look at the first rows of the predictions table, now with 'm_3a'.
df_predictions.head()

In [None]:
# Write the updated predictions dictionary back to the same pickle file it
# was loaded from, then drop the temporary names.
f_name = dirPData + 'dict_predictions.pickle'

with open(f_name, "wb") as f:
    pickle.dump(dict_, f)
    
del f_name, dict_

In [None]:
# Shut down the h2o cluster this session is connected to.
h2o.cluster().shutdown()