In [4]:
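# Install (run once in a shell, not in this notebook)
# pip install h2o
# conda install openjdk  # H2O starts a local JVM server, so a Java runtime is required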

import numpy as np
import pandas as pd
import datetime
import h2o
from h2o.automl import H2OAutoML

In [5]:
# DATA
# Read the NYC energy-consumption CSV (timeStamp, demand, precip, temp) directly from GitHub.
DATA_URL = 'https://raw.githubusercontent.com/srivatsan88/YouTubeLI/master/dataset/nyc_energy_consumption.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,timeStamp,demand,precip,temp
0,2012-01-01 00:00:00,4937.5,0.0,46.13
1,2012-01-01 01:00:00,4752.1,0.0,45.89
2,2012-01-01 02:00:00,4542.6,0.0,45.04
3,2012-01-01 03:00:00,4357.7,0.0,45.03
4,2012-01-01 04:00:00,4275.5,0.0,42.61


In [6]:
# Processing
# Parse the timestamp strings so the frame can be filtered and resampled by time.
df['timeStamp'] = pd.to_datetime(df['timeStamp'])
# demand missing from 2017-08-10
df = df[df['timeStamp']<'2017-08-10']

# Resample to one row per calendar day
df = df.set_index('timeStamp')
# NOTE(review): sum() skips NaN hours, so a day with some missing hourly
# demand is silently under-counted rather than flagged -- confirm acceptable.
df_daily = df[['demand', 'precip']].resample('D').sum()
# Group on the normalized (midnight) timestamps instead of `df.index.date`:
# the keys stay Timestamps, so the join below aligns DatetimeIndex with
# DatetimeIndex instead of relying on coercion of python `date` objects.
temp = df.groupby(df.index.normalize())['temp'].agg(min_temp = 'min', max_temp = 'max', avg_temp = 'mean')
df_daily = df_daily.join(temp)
df_daily = df_daily.reset_index()

# Treat na
print('Initial na', df_daily.isna().sum().sum())
# Forward-fill gaps with the previous day's values; `.ffill()` replaces the
# deprecated `fillna(method='ffill')` spelling and behaves the same.
df_daily = df_daily.ffill()
print('Current na', df_daily.isna().sum().sum())
df_daily.head()

Initial na 3
Current na 0


Unnamed: 0,timeStamp,demand,precip,min_temp,max_temp,avg_temp
0,2012-01-01,118916.0,0.0597,38.78,50.77,46.51
1,2012-01-02,127270.9,0.0,33.57,49.78,40.496667
2,2012-01-03,146292.3,0.0,16.47,33.14,26.6725
3,2012-01-04,152070.4,0.0,13.4,31.83,20.585
4,2012-01-05,147125.9,0.0,27.31,39.16,33.5775


In [7]:
# Time features
# Derive calendar columns from the daily timestamp via a single .dt accessor.
stamp = df_daily['timeStamp'].dt
df_daily['year'] = stamp.year
df_daily['month'] = stamp.month
# dayofweek is 0-based starting on Monday; shift so that 1 = Monday ... 7 = Sunday.
df_daily['weekday'] = stamp.dayofweek + 1
# ISO-8601 week number (the first days of January can belong to week 52/53).
df_daily['week'] = stamp.isocalendar().week
df_daily.head()

Unnamed: 0,timeStamp,demand,precip,min_temp,max_temp,avg_temp,year,month,weekday,week
0,2012-01-01,118916.0,0.0597,38.78,50.77,46.51,2012,1,7,52
1,2012-01-02,127270.9,0.0,33.57,49.78,40.496667,2012,1,1,1
2,2012-01-03,146292.3,0.0,16.47,33.14,26.6725,2012,1,2,1
3,2012-01-04,152070.4,0.0,13.4,31.83,20.585,2012,1,3,1
4,2012-01-05,147125.9,0.0,27.31,39.16,33.5775,2012,1,4,1


In [8]:
# Train/test split
# Chronological hold-out: rows up to and including the cutoff date train the
# model, everything after it is kept for testing.
cutoff = '2017-06-30'
stamps = df_daily['timeStamp']
train = df_daily.loc[stamps <= cutoff]
test = df_daily.loc[stamps > cutoff]
for part in (train, test):
    print(part.shape)

(2008, 10)
(40, 10)


## h2o

In [10]:
# Connect to an H2O instance on localhost, starting a local server if none is running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM (build 11.0.6+8-b765.1, mixed mode)
  Starting server from C:\Users\afotina\Anaconda3\envs\ts_automl_env\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\afotina\AppData\Local\Temp\tmpltx4ofkw
  JVM stdout: C:\Users\afotina\AppData\Local\Temp\tmpltx4ofkw\h2o_afotina_started_from_python.out
  JVM stderr: C:\Users\afotina\AppData\Local\Temp\tmpltx4ofkw\h2o_afotina_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Europe/Berlin
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.7
H2O_cluster_version_age:,4 days
H2O_cluster_name:,H2O_from_python_afotina_k79wlz
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.961 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [11]:
# Create h2o dataframes
# Convert both pandas splits into H2OFrames (train first, then test),
# then show per-column type and summary statistics for the training frame.
hf_train, hf_test = h2o.H2OFrame(train), h2o.H2OFrame(test)
hf_train.describe()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Rows:2008
Cols:10




Unnamed: 0,timeStamp,demand,precip,min_temp,max_temp,avg_temp,year,month,weekday,week
type,time,real,real,real,real,real,int,int,int,int
mins,1325376000000.0,87914.3,0.0,0.33,15.62,9.015,2012.0,1.0,1.0,1.0
mean,1412078400000.0,144959.5819855576,0.08386015936254966,48.72722609561756,61.65966633466135,55.064279524978964,2014.2704183266917,6.2495019920318855,3.999003984063745,25.42729083665339
maxs,1498780800000.0,243918.7,2.7169,83.23,97.26,89.60666666666667,2017.0,12.0,7.0,53.0
sigma,50095064724.58141,23120.937571071627,0.2186346457158585,16.759256931604337,17.694447439593798,17.028534121942798,1.6003491847131284,3.4407215484961915,2.0004979459805785,15.043623403727567
zeros,0,0,1159,0,0,0,0,0,0,0
missing,0,0,0,0,0,0,0,0,0,0
0,2012-01-01 00:00:00,118916.0,0.0597,38.78,50.77,46.51,2012.0,1.0,7.0,52.0
1,2012-01-02 00:00:00,127270.9,0.0,33.57,49.78,40.49666666666666,2012.0,1.0,1.0,1.0
2,2012-01-03 00:00:00,146292.3,0.0,16.47,33.14,26.6725,2012.0,1.0,2.0,1.0


In [12]:
# Col names
# Target column, and every other column of the training frame as a predictor.
y = 'demand'
all_columns = hf_train.columns
target_pos = all_columns.index(y)  # raises ValueError if the target is missing
X = all_columns[:target_pos] + all_columns[target_pos + 1:]

In [13]:
# AutoML search settings: stop after 600 seconds, with a fixed seed.
automl_settings = dict(max_runtime_secs=600, seed=42)
aml = H2OAutoML(**automl_settings)

# Fit on the training frame; models are ranked on hf_test.
# NOTE(review): hf_test is also the frame the leader is later evaluated on,
# so the leader is selected on the same data used to report its error --
# treat those numbers as validation metrics, not an unbiased test estimate.
aml.train(x=X, y=y, training_frame=hf_train, leaderboard_frame=hf_test)

AutoML progress: |
16:09:08.904: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%


In [14]:
# Show the ranked models; metrics are computed on the leaderboard frame passed to aml.train.
aml.leaderboard

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_BestOfFamily_AutoML_20210907_160908,50095200.0,7077.8,50095200.0,5135.41,0.0416947
GBM_grid__1_AutoML_20210907_160908_model_9,50209200.0,7085.85,50209200.0,5117.32,0.0417744
StackedEnsemble_AllModels_AutoML_20210907_160908,50288200.0,7091.41,50288200.0,5097.51,0.0418388
GBM_3_AutoML_20210907_160908,50587900.0,7112.52,50587900.0,5277.67,0.042074
GBM_grid__1_AutoML_20210907_160908_model_21,50607100.0,7113.87,50607100.0,5493.88,0.0424369
GBM_grid__1_AutoML_20210907_160908_model_23,51280300.0,7161.03,51280300.0,5564.32,0.042284
GBM_2_AutoML_20210907_160908,51582100.0,7182.07,51582100.0,5124.84,0.0421547
GBM_grid__1_AutoML_20210907_160908_model_24,53325300.0,7302.42,53325300.0,5196.11,0.0432058
GBM_grid__1_AutoML_20210907_160908_model_16,54919000.0,7410.74,54919000.0,5375.82,0.0436438
GBM_grid__1_AutoML_20210907_160908_model_20,55245800.0,7432.75,55245800.0,5454.0,0.0436445




In [15]:
# Take the top-ranked model from the AutoML run and display its details.
leader_model = aml.leader
leader_model

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_BestOfFamily_AutoML_20210907_160908

No model summary for this model

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 9483537.522413915
RMSE: 3079.5352770205304
MAE: 1946.3645031591766
RMSLE: 0.02116806770831739
R^2: 0.982250921095122
Mean Residual Deviance: 9483537.522413915
Null degrees of freedom: 2007
Residual degrees of freedom: 2004
Null deviance: 1072897554124.413
Residual deviance: 19042943345.00714
AIC: 37967.11361503487

ModelMetricsRegressionGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 25868396.637355495
RMSE: 5086.098370790275
MAE: 3257.429644333197
RMSLE: 0.03423445941169222
R^2: 0.9515855542328988
Mean Residual Deviance: 25868396.637355495
Null degrees of freedom: 2007
Residual degrees of freedom: 2004
Null deviance: 1074397838404.2074
Residual deviance: 51943740447.80984
AIC: 39982.07055906461




In [18]:
# Score the held-out frame with the leader model; the result is used below to compute error metrics.
pred = leader_model.predict(hf_test)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [33]:
# Error metrics of the leader model on hf_test.
# H2OFrame.mean() returns a list with one entry per column, so element 0 is
# taken for BOTH metrics; previously mape_valid was left as a one-element
# list and printed as "[0.03...]".
error = pred - hf_test['demand']
rmse_valid = np.sqrt((error**2).mean()[0])
mape_valid = (abs(error/hf_test['demand'])).mean()[0]
print('h2o model validation results:')
print('RMSE_valid', rmse_valid)
print('mape_valid', mape_valid)

h2o model validation results:
RMSE_valid 7077.796980629294
mape_valid [0.03031913755113745]
