In [1]:
#!pip install requests
#!pip install tabulate
#!pip install scikit-learn
#!pip install colorama
#!pip install future

In [2]:
# The following command removes the H2O module for Python.
#!pip uninstall -y h2o

# Next, use pip to install this version of the H2O Python module.
#!pip install http://h2o-release.s3.amazonaws.com/h2o/rel-weierstrass/2/Python/h2o-3.14.0.2-py2.py3-none-any.whl

In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_validation import ShuffleSplit

#h2o
import h2o
import time
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator, H2ODeepLearningEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator

# Pretty display for notebooks
%matplotlib inline






You can upgrade to the newest version of the module running from the command line
    $ pip2 install --upgrade requests


In [3]:
# Start H2O on your local machine
h2o.init(nthreads=-1)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...


CalledProcessError: Command '[u'C:\\ProgramData\\Oracle\\Java\\javapath\\java.exe', u'-version']' returned non-zero exit status 1

In [None]:
# Load the datasets
print("Import and Parse training data")
train = h2o.import_file(path='train_aWnotuB.csv', col_types=["time"] + ['factor']+['int']+['enum'])

In [110]:
print("Import and Parse testing data")
test = h2o.import_file(path='test_BdBKkAj.csv', col_types=["time"] + ['factor']+['enum'])

Import and Parse testing data
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [42]:
train.names

[u'DateTime', u'Junction', u'Vehicles', u'ID']

In [43]:
train.head

DateTime,Junction,Vehicles,ID
2015-11-01 00:00:00,1,15,20151101001
2015-11-01 01:00:00,1,13,20151101011
2015-11-01 02:00:00,1,10,20151101021
2015-11-01 03:00:00,1,7,20151101031
2015-11-01 04:00:00,1,9,20151101041
2015-11-01 05:00:00,1,6,20151101051
2015-11-01 06:00:00,1,9,20151101061
2015-11-01 07:00:00,1,8,20151101071
2015-11-01 08:00:00,1,11,20151101081
2015-11-01 09:00:00,1,12,20151101091


<bound method H2OFrame.head of >

In [44]:
test.head

DateTime,Junction,ID
2017-07-01 00:00:00,1,20170701001
2017-07-01 01:00:00,1,20170701011
2017-07-01 02:00:00,1,20170701021
2017-07-01 03:00:00,1,20170701031
2017-07-01 04:00:00,1,20170701041
2017-07-01 05:00:00,1,20170701051
2017-07-01 06:00:00,1,20170701061
2017-07-01 07:00:00,1,20170701071
2017-07-01 08:00:00,1,20170701081
2017-07-01 09:00:00,1,20170701091


<bound method H2OFrame.head of >

In [45]:
train.shape

(48120, 4)

In [46]:
test.shape

(11808, 3)

In [47]:
train.tail()

DateTime,Junction,Vehicles,ID
2017-06-30 14:00:00,4,10,20170630144
2017-06-30 15:00:00,4,14,20170630154
2017-06-30 16:00:00,4,16,20170630164
2017-06-30 17:00:00,4,16,20170630174
2017-06-30 18:00:00,4,17,20170630184
2017-06-30 19:00:00,4,11,20170630194
2017-06-30 20:00:00,4,30,20170630204
2017-06-30 21:00:00,4,16,20170630214
2017-06-30 22:00:00,4,22,20170630224
2017-06-30 23:00:00,4,12,20170630234




In [48]:
train.describe()

Rows:48120
Cols:4




Unnamed: 0,DateTime,Junction,Vehicles,ID
type,time,enum,int,enum
mins,1.446336e+12,,1.0,
mean,1.47426503611e+12,,22.7913341646,
maxs,1.4988636e+12,,180.0,
sigma,15459314347.0,,20.75006252,
zeros,0,,0,
missing,0,0,0,0
0,2015-11-01 00:00:00,1,15.0,20151101001
1,2015-11-01 01:00:00,1,13.0,20151101011
2,2015-11-01 02:00:00,1,10.0,20151101021


In [49]:
test.describe()

Rows:11808
Cols:3




Unnamed: 0,DateTime,Junction,ID
type,time,enum,enum
mins,1.4988672e+12,,
mean,1.504179e+12,,
maxs,1.5094908e+12,,
sigma,3067938126.4,,
zeros,0,,
missing,0,0,0
0,2017-07-01 00:00:00,1,20170701001
1,2017-07-01 01:00:00,1,20170701011
2,2017-07-01 02:00:00,1,20170701021


In [50]:
train.types

{u'DateTime': u'time',
 u'ID': u'enum',
 u'Junction': u'enum',
 u'Vehicles': u'int'}

In [52]:
train.group_by(by=["Junction"]).get_frame()

Junction
1
2
3
4




In [76]:
def refine_date_col(data, col, pattern):
    #data[col]         = data[col].as_date(pattern) # As of 5/29/2106 H2O defaults parse as a date
    data["Day"]       = data[col].day()
    data["Month"]     = data[col].month() + 1     # Since H2O indexes from 0
    data["Year"]      = data[col].year() 
    data["WeekNum"]   = data[col].week()
    data["WeekDay"]   = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()
    
    data.describe()  # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.
    
    # Create weekend and season cols
    # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
    # data["Weekend"]   = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
    data["Weekend"] = ((data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat"))
    data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])


In [77]:
refine_date_col(train, "DateTime", "%Y-%m-%d %H:%M:%S")
train = train.drop("DateTime")
train.describe()

Rows:48120
Cols:10




Unnamed: 0,DateTime,Junction,Vehicles,ID,Day,Month,Year,WeekNum,WeekDay,HourOfDay
type,time,enum,int,enum,int,int,int,int,enum,int
mins,1.446336e+12,,1.0,,1.0,2.0,2015.0,1.0,,0.0
mean,1.47426503611e+12,,22.7913341646,,15.7007481297,6.88428927681,2016.26982544,23.9416458853,,11.5
maxs,1.4988636e+12,,180.0,,31.0,13.0,2017.0,53.0,,23.0
sigma,15459314347.0,,20.75006252,,8.78407263786,3.56987158004,0.616093498737,15.7180656998,,6.92225847985
zeros,0,,0,,0,0,0,0,,2005
missing,0,0,0,0,0,0,0,0,0,0
0,2015-11-01 00:00:00,1,15.0,20151101001,1.0,12.0,2015.0,44.0,Sun,0.0
1,2015-11-01 01:00:00,1,13.0,20151101011,1.0,12.0,2015.0,44.0,Sun,1.0
2,2015-11-01 02:00:00,1,10.0,20151101021,1.0,12.0,2015.0,44.0,Sun,2.0


Rows:48120
Cols:11




Unnamed: 0,Junction,Vehicles,ID,Day,Month,Year,WeekNum,WeekDay,HourOfDay,Weekend,Season
type,enum,int,enum,int,int,int,int,enum,int,int,enum
mins,,1.0,,1.0,2.0,2015.0,1.0,,0.0,0.0,
mean,,22.7913341646,,15.7007481297,6.88428927681,2016.26982544,23.9416458853,,11.5,0.284289276808,
maxs,,180.0,,31.0,13.0,2017.0,53.0,,23.0,1.0,
sigma,,20.75006252,,8.78407263786,3.56987158004,0.616093498737,15.7180656998,,6.92225847985,0.45107994009,
zeros,,0,,0,0,0,0,,2005,34440,
missing,0,0,0,0,0,0,0,0,0,0,4464
0,1,15.0,20151101001,1.0,12.0,2015.0,44.0,Sun,0.0,1.0,Winter
1,1,13.0,20151101011,1.0,12.0,2015.0,44.0,Sun,1.0,1.0,Winter
2,1,10.0,20151101021,1.0,12.0,2015.0,44.0,Sun,2.0,1.0,Winter


In [78]:
train

Junction,Vehicles,ID,Day,Month,Year,WeekNum,WeekDay,HourOfDay,Weekend,Season
1,15,20151101001,1,12,2015,44,Sun,0,1,Winter
1,13,20151101011,1,12,2015,44,Sun,1,1,Winter
1,10,20151101021,1,12,2015,44,Sun,2,1,Winter
1,7,20151101031,1,12,2015,44,Sun,3,1,Winter
1,9,20151101041,1,12,2015,44,Sun,4,1,Winter
1,6,20151101051,1,12,2015,44,Sun,5,1,Winter
1,9,20151101061,1,12,2015,44,Sun,6,1,Winter
1,8,20151101071,1,12,2015,44,Sun,7,1,Winter
1,11,20151101081,1,12,2015,44,Sun,8,1,Winter
1,12,20151101091,1,12,2015,44,Sun,9,1,Winter




In [117]:
refine_date_col(test, "DateTime", "%Y-%m-%d %H:%M:%S")
test = test.drop("DateTime")
test.describe()

Rows:11808
Cols:9




Unnamed: 0,DateTime,Junction,ID,Day,Month,Year,WeekNum,WeekDay,HourOfDay
type,time,enum,enum,int,int,int,int,enum,int
mins,1.4988672e+12,,,1.0,8.0,2017.0,26.0,,0.0
mean,1.504179e+12,,,15.8780487805,9.49593495935,2017.0,35.0,,11.5
maxs,1.5094908e+12,,,31.0,11.0,2017.0,44.0,,23.0
sigma,3067938126.4,,,8.87766527017,1.12170411131,0.0,5.08486462215,,6.92247968532
zeros,0,,,0,0,0,0,,492
missing,0,0,0,0,0,0,0,0,0
0,2017-07-01 00:00:00,1,20170701001,1.0,8.0,2017.0,26.0,Sat,0.0
1,2017-07-01 01:00:00,1,20170701011,1.0,8.0,2017.0,26.0,Sat,1.0
2,2017-07-01 02:00:00,1,20170701021,1.0,8.0,2017.0,26.0,Sat,2.0


Rows:11808
Cols:10




Unnamed: 0,Junction,ID,Day,Month,Year,WeekNum,WeekDay,HourOfDay,Weekend,Season
type,enum,enum,int,int,int,int,enum,int,int,enum
mins,,,1.0,8.0,2017.0,26.0,,0.0,0.0,
mean,,,15.8780487805,9.49593495935,2017.0,35.0,,11.5,0.292682926829,
maxs,,,31.0,11.0,2017.0,44.0,,23.0,1.0,
sigma,,,8.87766527017,1.12170411131,0.0,5.08486462215,,6.92247968532,0.455013367724,
zeros,,,0,0,0,0,,492,8352,
missing,0,0,0,0,0,0,0,0,0,0
0,1,20170701001,1.0,8.0,2017.0,26.0,Sat,0.0,1.0,Autumn
1,1,20170701011,1.0,8.0,2017.0,26.0,Sat,1.0,1.0,Autumn
2,1,20170701021,1.0,8.0,2017.0,26.0,Sat,2.0,1.0,Autumn


In [80]:
train['Season'].table()

Season,Count
Winter,5208
Spring,15024
Summer,10248
Autumn,6624
Winter,6552




In [81]:
train['Year'].table()

Year,Count
2015,4392
2016,26352
2017,17376




In [83]:
test['Season'].table()

Season,Count
Autumn,8832
Winter,2976




# Manual

In [93]:
#manual setting
s = time.time()
  
gbm0 = H2OGradientBoostingEstimator(model_id='gbm_manual',
                                    seed=1234,
                                    ntrees= 100, 
                                    max_depth=6,
                                    col_sample_rate=0.9,
                                    sample_rate=0.9)
    
features = train.names
features.remove("Vehicles")
features.remove("ID")
gbm0.train(x =features,
             y  ="Vehicles",
             training_frame  =train,
             )

gbm_elapsed = time.time() - s

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [102]:
gbm0.summary

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm_manual


ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 21.2195427995
RMSE: 4.60646749685
MAE: 2.72009918344
RMSLE: 0.226741413531
Mean Residual Deviance: 21.2195427995
Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2017-11-17 22:57:57,0.077 sec,0.0,20.7498469,15.3889454,430.5561468
,2017-11-17 22:57:57,0.497 sec,1.0,18.9104691,14.0253245,357.6058418
,2017-11-17 22:57:57,0.622 sec,2.0,17.2878303,12.8165553,298.8690754
,2017-11-17 22:57:57,0.763 sec,3.0,15.8438876,11.7353076,251.0287753
,2017-11-17 22:57:57,0.859 sec,4.0,14.5646535,10.7715379,212.1291304
---,---,---,---,---,---,---
,2017-11-17 22:58:00,3.548 sec,96.0,4.6345131,2.7365160,21.4787117
,2017-11-17 22:58:00,3.566 sec,97.0,4.6254569,2.7332339,21.3948516
,2017-11-17 22:58:00,3.584 sec,98.0,4.6197327,2.7293643,21.3419301



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Junction,57224472.0000000,1.0,0.5881395
Year,15632369.0000000,0.2731763,0.1606658
HourOfDay,11760631.0000000,0.2055175,0.1208730
WeekDay,4453750.0,0.0778295,0.0457746
Month,2680045.5,0.0468339,0.0275449
Weekend,1955348.8750000,0.0341698,0.0200966
WeekNum,1812148.5,0.0316674,0.0186248
Season,1173426.0,0.0205057,0.0120602
Day,605254.3125000,0.0105768,0.0062207


<bound method H2OGradientBoostingEstimator.summary of >

In [99]:
test

Junction,Day,Month,Year,WeekNum,WeekDay,HourOfDay,Weekend,Season
1,1,8,2017,26,Sat,0,1,Autumn
1,1,8,2017,26,Sat,1,1,Autumn
1,1,8,2017,26,Sat,2,1,Autumn
1,1,8,2017,26,Sat,3,1,Autumn
1,1,8,2017,26,Sat,4,1,Autumn
1,1,8,2017,26,Sat,5,1,Autumn
1,1,8,2017,26,Sat,6,1,Autumn
1,1,8,2017,26,Sat,7,1,Autumn
1,1,8,2017,26,Sat,8,1,Autumn
1,1,8,2017,26,Sat,9,1,Autumn




In [103]:
preds = gbm0.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


# Manual Setting + Cross-Validation

In [155]:
#manual setting +CV
s = time.time()
  
gbm1 = H2OGradientBoostingEstimator(model_id='gbm_manual1',
                                    seed=1234,
                                    ntrees= 100, 
                                    max_depth=6,
                                    col_sample_rate=0.9,
                                    sample_rate=0.9,
                                    nfolds = 5)
    
features = train.names
features.remove("Vehicles")
features.remove("ID")
gbm1.train(x =features,
             y  ="Vehicles",
             training_frame  =train,
             )

gbm_elapsed = time.time() - s

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [156]:
gbm1

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm_manual1


ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 21.2195427995
RMSE: 4.60646749685
MAE: 2.72009918344
RMSLE: 0.226741413531
Mean Residual Deviance: 21.2195427995

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 23.1341640793
RMSE: 4.80979875663
MAE: 2.83065988916
RMSLE: 0.236420125625
Mean Residual Deviance: 23.1341640793
Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,2.830688,0.0243806,2.8034394,2.8900363,2.8305,2.790543,2.838921
mean_residual_deviance,23.139494,1.5785824,23.352476,27.159534,20.91363,21.20202,23.069813
mse,23.139494,1.5785824,23.352476,27.159534,20.91363,21.20202,23.069813
r2,0.9461834,0.0041132,0.9442265,0.9360531,0.9522212,0.9512054,0.9472112
residual_deviance,23.139494,1.5785824,23.352476,27.159534,20.91363,21.20202,23.069813
rmse,4.8049464,0.1612190,4.83244,5.211481,4.573142,4.604565,4.8031044
rmsle,0.2364073,0.0023127,0.2363034,0.2398918,0.2327701,0.2327893,0.2402822


Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2017-11-17 23:50:35,10.143 sec,0.0,20.7498469,15.3889454,430.5561468
,2017-11-17 23:50:35,10.157 sec,1.0,18.9104691,14.0253245,357.6058418
,2017-11-17 23:50:35,10.170 sec,2.0,17.2878303,12.8165553,298.8690754
,2017-11-17 23:50:35,10.182 sec,3.0,15.8438876,11.7353076,251.0287753
,2017-11-17 23:50:35,10.195 sec,4.0,14.5646535,10.7715379,212.1291304
---,---,---,---,---,---,---
,2017-11-17 23:50:36,11.405 sec,96.0,4.6345131,2.7365160,21.4787117
,2017-11-17 23:50:36,11.419 sec,97.0,4.6254569,2.7332339,21.3948516
,2017-11-17 23:50:36,11.432 sec,98.0,4.6197327,2.7293643,21.3419301



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Junction,57224472.0000000,1.0,0.5881395
Year,15632369.0000000,0.2731763,0.1606658
HourOfDay,11760631.0000000,0.2055175,0.1208730
WeekDay,4453750.0,0.0778295,0.0457746
Month,2680045.5,0.0468339,0.0275449
Weekend,1955348.8750000,0.0341698,0.0200966
WeekNum,1812148.5,0.0316674,0.0186248
Season,1173426.0,0.0205057,0.0120602
Day,605254.3125000,0.0105768,0.0062207




In [125]:
preds = gbm1.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


# manual setting+cv+early stopping

In [157]:
#manual setting +CV
s = time.time()
  
gbm2 = H2OGradientBoostingEstimator(model_id='gbm_manual2',
                                    seed=1234,
                                    ntrees= 100, 
                                    max_depth=6,
                                    col_sample_rate=0.9,
                                    sample_rate=0.9,
                                    nfolds = 5,
                                    stopping_metric='mse',
                                    stopping_rounds=15,
                                    score_tree_interval=1)
    
features = train.names
features.remove("Vehicles")
features.remove("ID")
gbm2.train(x =features,
             y  ="Vehicles",
             training_frame  =train,
             )

gbm_elapsed = time.time() - s

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [158]:
gbm2

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm_manual2


ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 21.2195427995
RMSE: 4.60646749685
MAE: 2.72009918344
RMSLE: 0.226741413531
Mean Residual Deviance: 21.2195427995

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 23.1341640793
RMSE: 4.80979875663
MAE: 2.83065988916
RMSLE: 0.236420125625
Mean Residual Deviance: 23.1341640793
Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,2.830688,0.0243806,2.8034394,2.8900363,2.8305,2.790543,2.838921
mean_residual_deviance,23.139494,1.5785824,23.352476,27.159534,20.91363,21.20202,23.069813
mse,23.139494,1.5785824,23.352476,27.159534,20.91363,21.20202,23.069813
r2,0.9461834,0.0041132,0.9442265,0.9360531,0.9522212,0.9512054,0.9472112
residual_deviance,23.139494,1.5785824,23.352476,27.159534,20.91363,21.20202,23.069813
rmse,4.8049464,0.1612190,4.83244,5.211481,4.573142,4.604565,4.8031044
rmsle,0.2364073,0.0023127,0.2363034,0.2398918,0.2327701,0.2327893,0.2402822


Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2017-11-17 23:51:34,24.109 sec,0.0,20.7498469,15.3889454,430.5561468
,2017-11-17 23:51:34,24.123 sec,1.0,18.9104691,14.0253245,357.6058418
,2017-11-17 23:51:34,24.138 sec,2.0,17.2878303,12.8165553,298.8690754
,2017-11-17 23:51:34,24.150 sec,3.0,15.8438876,11.7353076,251.0287753
,2017-11-17 23:51:34,24.163 sec,4.0,14.5646535,10.7715379,212.1291304
---,---,---,---,---,---,---
,2017-11-17 23:51:36,26.260 sec,96.0,4.6345131,2.7365160,21.4787117
,2017-11-17 23:51:36,26.274 sec,97.0,4.6254569,2.7332339,21.3948516
,2017-11-17 23:51:36,26.287 sec,98.0,4.6197327,2.7293643,21.3419301



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Junction,57224472.0000000,1.0,0.5881395
Year,15632369.0000000,0.2731763,0.1606658
HourOfDay,11760631.0000000,0.2055175,0.1208730
WeekDay,4453750.0,0.0778295,0.0457746
Month,2680045.5,0.0468339,0.0275449
Weekend,1955348.8750000,0.0341698,0.0200966
WeekNum,1812148.5,0.0316674,0.0186248
Season,1173426.0,0.0205057,0.0120602
Day,605254.3125000,0.0105768,0.0062207




# Grid search

In [161]:
#manual setting +CV
s = time.time()

from h2o.grid.grid_search import H2OGridSearch

search_criteria = {'strategy':'Cartesian'}

hyper_parameters = {'sample_rate':[0.7,0.8,0.9], 'col_sample_rate':[0.7,0.8,0.9]}
  
gbm3 = H2OGridSearch(
            H2OGradientBoostingEstimator(
                                    model_id='gbm_manual',
                                    seed=1234,
                                    ntrees= 100, 
                                    max_depth=6,
                                    nfolds = 5,
                                    stopping_metric='mse',
                                    stopping_rounds=15,
                                    score_tree_interval=1),
                                    search_criteria = search_criteria,
                                   hyper_params = hyper_parameters)
    
features = train.names
features.remove("Vehicles")
features.remove("ID")
gbm3.train(x =features,
             y  ="Vehicles",
             training_frame  =train,
             )

gbm_elapsed = time.time() - s

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [165]:
grid_sorted = gbm3.get_grid(sort_by='mse',decreasing=False)
print(grid_sorted)

    col_sample_rate sample_rate  \
0               0.9         0.8   
1               0.9         0.7   
2               0.9         0.9   
3               0.8         0.8   
4               0.8         0.7   
5               0.7         0.7   
6               0.8         0.9   
7               0.7         0.9   
8               0.7         0.8   

                                                      model_ids  \
0  Grid_GBM_py_50_sid_8c9e_model_python_1510970162667_1_model_5   
1  Grid_GBM_py_50_sid_8c9e_model_python_1510970162667_1_model_2   
2  Grid_GBM_py_50_sid_8c9e_model_python_1510970162667_1_model_8   
3  Grid_GBM_py_50_sid_8c9e_model_python_1510970162667_1_model_4   
4  Grid_GBM_py_50_sid_8c9e_model_python_1510970162667_1_model_1   
5  Grid_GBM_py_50_sid_8c9e_model_python_1510970162667_1_model_0   
6  Grid_GBM_py_50_sid_8c9e_model_python_1510970162667_1_model_7   
7  Grid_GBM_py_50_sid_8c9e_model_python_1510970162667_1_model_6   
8  Grid_GBM_py_50_sid_8c9e_model_python_151097

In [167]:
best = grid_sorted.model_ids[0]
best_gbm_full_grid = h2o.get_model(best)
best_gbm_full_grid.summary

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_50_sid_8c9e_model_python_1510970162667_1_model_5


ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 21.5639250793
RMSE: 4.64369735009
MAE: 2.7345022982
RMSLE: 0.227518807972
Mean Residual Deviance: 21.5639250793

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 23.0762322182
RMSE: 4.80377270676
MAE: 2.83058653794
RMSLE: 0.235739813754
Mean Residual Deviance: 23.0762322182
Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,2.8305779,0.0194763,2.8078384,2.8784323,2.8418577,2.8024662,2.822295
mean_residual_deviance,23.08123,1.3991913,23.30821,26.668571,20.99138,21.585567,22.852427
mse,23.08123,1.3991913,23.30821,26.668571,20.99138,21.585567,22.852427
r2,0.9463232,0.0037104,0.9443322,0.9372091,0.9520435,0.9503227,0.9477085
residual_deviance,23.08123,1.3991913,23.30821,26.668571,20.99138,21.585567,22.852427
rmse,4.8000207,0.1432356,4.8278575,5.164162,4.581635,4.646027,4.7804213
rmsle,0.2357301,0.0020395,0.2364380,0.2381348,0.2324560,0.2323051,0.2393164


Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2017-11-18 00:01:11,2 min 34.035 sec,0.0,20.7498469,15.3889454,430.5561468
,2017-11-18 00:01:11,2 min 34.047 sec,1.0,18.9104734,14.0262473,357.6060054
,2017-11-18 00:01:11,2 min 34.058 sec,2.0,17.2904203,12.8189753,298.9586352
,2017-11-18 00:01:11,2 min 34.070 sec,3.0,15.8462111,11.7368641,251.1024068
,2017-11-18 00:01:11,2 min 34.083 sec,4.0,14.5661720,10.7729176,212.1733671
---,---,---,---,---,---,---
,2017-11-18 00:01:12,2 min 35.192 sec,96.0,4.6771586,2.7539841,21.8758125
,2017-11-18 00:01:12,2 min 35.204 sec,97.0,4.6734806,2.7522946,21.8414206
,2017-11-18 00:01:12,2 min 35.216 sec,98.0,4.6555918,2.7424399,21.6745354



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Junction,54827112.0000000,1.0,0.6038350
Year,13774783.0000000,0.2512404,0.1517077
HourOfDay,10738238.0000000,0.1958563,0.1182649
WeekDay,3181119.7500000,0.0580209,0.0350351
Month,2663746.7500000,0.0485845,0.0293370
Weekend,2570627.0,0.0468861,0.0283114
WeekNum,1648819.2500000,0.0300731,0.0181592
Season,877721.1250000,0.0160089,0.0096667
Day,515999.3750000,0.0094114,0.0056829


<bound method H2OGradientBoostingEstimator.summary of >

In [None]:
best_gbm_full_grid.model_performance(X_test)

In [168]:
preds = best_gbm_full_grid.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [170]:
preds

predict
62.2133
54.8503
44.563
36.8876
30.8197
29.2962
31.3513
34.728
37.9277
43.1146




In [169]:
submission = pd.concat((h2o.as_list(test['ID']), h2o.as_list(preds)), axis=1, ignore_index=True)
submission.columns = ['ID', 'Vehicles']
submission = submission.set_index(['ID', 'Vehicles'])
submission.to_csv('submission_h2o_gbm+5fold+earlystopping+gridsearch.csv')

In [3]:
h2o.shutdown()

    >>> h2o.shutdown()
        ^^^^ Deprecated, use ``h2o.cluster().shutdown()``.
H2O session _sid_a8ac closed.
