# PeakGuard H2O AutoML Training Pipeline

This notebook trains a global next-hour forecaster using H2O-3 AutoML on the SQLite readings.
It saves the trained model artifacts (H2O model) and logs metrics into SQLite for the dashboard.

Requirements:
- pip/poetry dependency: `h2o`
- Local Java runtime (H2O requires a JVM)



In [10]:
import os, sqlite3, json, shutil
import numpy as np, pandas as pd
from datetime import datetime
from zoneinfo import ZoneInfo
from pathlib import Path

# H2O
import h2o
from h2o.automl import H2OAutoML

# Paths
def resolve_project_root():
    p = Path.cwd().resolve()
    for _ in range(6):
        if (p / 'pyproject.toml').exists() or (p / 'app').exists():
            return p
        if p.parent == p:
            break
        p = p.parent
    return Path.cwd().resolve()

PROJECT_ROOT = resolve_project_root()
DB_PATH = os.environ.get('DB_PATH', str(PROJECT_ROOT / 'data' / 'peakguard.db'))
ART_DIR = str(PROJECT_ROOT / 'artifacts')
VERSIONS_DIR = str(Path(ART_DIR) / 'versions')
LATEST_DIR = str(Path(ART_DIR) / 'latest' / 'h2o')
os.makedirs(VERSIONS_DIR, exist_ok=True)
os.makedirs(LATEST_DIR, exist_ok=True)

# Start H2O
h2o.init()


Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,10 mins 24 secs
H2O_cluster_timezone:,America/Toronto
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,4 months and 14 days
H2O_cluster_name:,H2O_from_python_andressalguero_tmwid7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8.62 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [12]:
# Load device metadata and readings from SQLite

def read_sql_df(query, params=()):
    with sqlite3.connect(DB_PATH) as conn:
        return pd.read_sql_query(query, conn, params=params)

devices = read_sql_df('SELECT id, timezone FROM devices ORDER BY id')
devices


Unnamed: 0,id,timezone
0,1,America/New_York
1,2,Europe/Berlin


In [13]:
def to_device_local(ts_utc, tz):
    ts = pd.to_datetime(ts_utc, utc=True)
    return ts.tz_convert(ZoneInfo(tz)).tz_convert(None)

def make_features(df, device_id, device_tz):
    if df.empty: return df
    idx_local = pd.DatetimeIndex([to_device_local(t, device_tz) for t in df.index])
    df_local = df.copy()
    df_local.index = idx_local
    df_local['hour'] = df_local.index.hour
    df_local['dow'] = df_local.index.dayofweek
    df_local['hour_sin'] = np.sin(2*np.pi*df_local['hour']/24)
    df_local['hour_cos'] = np.cos(2*np.pi*df_local['hour']/24)
    df_local['dow_sin'] = np.sin(2*np.pi*df_local['dow']/7)
    df_local['dow_cos'] = np.cos(2*np.pi*df_local['dow']/7)
    df_local['device_id'] = int(device_id)
    # Lags
    for k in [1,2,3,6,12,24,48]:
        df_local[f'lag_{k}'] = df_local['consumption'].shift(k)
    # Rolling mean
    df_local['roll24'] = df_local['consumption'].rolling(24).mean()
    return df_local.dropna()

def fetch_device_df(device_id, device_tz, days=120):
    end_utc = pd.Timestamp.utcnow().floor('H')
    start_utc = end_utc - pd.Timedelta(days=days)
    df = read_sql_df(
        'SELECT ts_utc, consumption FROM readings WHERE device_id=? AND ts_utc BETWEEN ? AND ? ORDER BY ts_utc',
        (device_id, start_utc.strftime('%Y-%m-%d %H:%M:%S'), end_utc.strftime('%Y-%m-%d %H:%M:%S'))
    )
    df.index = pd.to_datetime(df['ts_utc'])
    df = df.drop(columns=['ts_utc'])
    return make_features(df, device_id, device_tz)

frames = []
for _, row in devices.iterrows():
    frames.append(fetch_device_df(int(row.id), row.timezone))
full = pd.concat(frames).sort_index()
full.head()


  end_utc = pd.Timestamp.utcnow().floor('H')
  end_utc = pd.Timestamp.utcnow().floor('H')


Unnamed: 0,consumption,hour,dow,hour_sin,hour_cos,dow_sin,dow_cos,device_id,lag_1,lag_2,lag_3,lag_6,lag_12,lag_24,lag_48,roll24
2025-06-13 18:00:00,0.835234,18,4,-1.0,-1.83697e-16,-0.433884,-0.900969,2,0.799217,0.868995,0.931697,1.021872,1.041469,1.169461,1.0,1.059408
2025-06-13 19:00:00,0.939549,19,4,-0.965926,0.258819,-0.433884,-0.900969,2,0.835234,0.799217,0.868995,0.964782,0.946222,1.235856,0.979811,1.047062
2025-06-13 20:00:00,1.033996,20,4,-0.866025,0.5,-0.433884,-0.900969,2,0.939549,0.835234,0.799217,0.927806,1.000716,1.052921,0.926145,1.046273
2025-06-13 21:00:00,1.056594,21,4,-0.707107,0.7071068,-0.433884,-0.900969,2,1.033996,0.939549,0.835234,0.931697,0.948943,1.16474,1.129726,1.041767
2025-06-13 22:00:00,1.194694,22,4,-0.5,0.8660254,-0.433884,-0.900969,2,1.056594,1.033996,0.939549,0.868995,0.940089,1.289538,1.157112,1.037815


In [14]:
# Prepare H2O frame
hf = h2o.H2OFrame(full.reset_index().rename(columns={'index':'timestamp'}))
# Types
hf['device_id'] = hf['device_id'].asfactor()
for c in ['hour','dow']:
    hf[c] = hf[c].asfactor()

# Train/validation split by time (80/20)
N = hf.nrows
split = int(N * 0.8)
hf_train = hf[:split, :]
hf_val = hf[split:, :]

# Predict next-hour consumption -> use lag features; set y
features = [c for c in hf.columns if c not in ['consumption','timestamp']]
y = 'consumption'

aml = H2OAutoML(max_runtime_secs=600, seed=42, sort_metric='RMSE')
aml.train(x=features, y=y, training_frame=hf_train, leaderboard_frame=hf_val)

lb = aml.leaderboard
leader = aml.leader
perf = leader.model_performance(hf_val)
rmse = float(perf.rmse())
rmse


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
00:17:39.513: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%


0.35314061356306414

In [15]:
# Save artifacts (MOJO) under a versioned directory and copy to latest/h2o
version = datetime.utcnow().strftime('%Y%m%d_%H%M%S')
version_dir = os.path.join(VERSIONS_DIR, version, 'h2o')
os.makedirs(version_dir, exist_ok=True)

mojo_path = os.path.join(version_dir, 'model.mojo.zip')
leader.download_mojo(mojo_path, get_genmodel_jar=False)
# copy to latest/h2o
shutil.copy(mojo_path, os.path.join(LATEST_DIR, 'model.mojo.zip'))
version_dir


'/Users/andressalguero/Documents/peakguard_api/artifacts/versions/20250811_042804/h2o'

In [17]:
# Display H2O AutoML results: leaderboard and metric summary
import plotly.graph_objects as go

# Leaderboard as pandas table
lb_df = lb.as_data_frame()
# Show top 10 models
lb_head = lb_df.head(10)
lb_head






Unnamed: 0,model_id,rmse,mse,mae,rmsle,mean_residual_deviance
0,StackedEnsemble_BestOfFamily_5_AutoML_1_202508...,0.353141,0.124708,0.13564,0.117388,0.124708
1,GBM_lr_annealing_selection_AutoML_1_20250811_0...,0.353469,0.12494,0.144598,0.119068,0.12494
2,StackedEnsemble_BestOfFamily_7_AutoML_1_202508...,0.354033,0.125339,0.136008,0.117915,0.125339
3,GBM_grid_1_AutoML_1_20250811_01739_model_14,0.354514,0.12568,0.14404,0.119402,0.12568
4,GBM_grid_1_AutoML_1_20250811_01739_model_55,0.355467,0.126357,0.14482,0.119802,0.126357
5,StackedEnsemble_BestOfFamily_4_AutoML_1_202508...,0.356176,0.126862,0.147578,0.12062,0.126862
6,StackedEnsemble_Best1000_1_AutoML_1_20250811_0...,0.356514,0.127102,0.141554,0.119573,0.127102
7,StackedEnsemble_AllModels_2_AutoML_1_20250811_...,0.356516,0.127104,0.149719,0.121078,0.127104
8,GBM_grid_1_AutoML_1_20250811_01739_model_50,0.356555,0.127131,0.143833,0.12021,0.127131
9,StackedEnsemble_AllModels_4_AutoML_1_20250811_...,0.356818,0.127319,0.143037,0.119893,0.127319
