In [1]:
import gymnasium as gym
import numpy as np
import ochre_gym

# 1) Load env with only heating setpoint control
env = ochre_gym.load(
    "basic-v0",
    override_equipment_controls={"HVAC Heating": ["Setpoint"]},
    vectorize_actions=True,
    vectorize_observations=True,
    # Small, stable observation set (edit to taste)
    override_ochre_observations_with_keys=[
        "Temperature - Indoor (C)",
        "Temperature - Outdoor (C)",
        "Energy Price ($)",
        "Hour of Day",           # provided by env if supported; else encode from Datetime in wrapper
    ],
    start_time="2018-12-01 00:00:00",
    end_time="2018-12-31 23:30:00",
    time_res="00:30",
    dr_type="RTP",
    reward_normalization=True,
    thermal_comfort_band=[20.0, 23.0],
    thermal_comfort_unit_penalty=10.0,
    log_to_file=False, log_to_console=False,
)

# 2) Discretize the heating setpoint (e.g., 7 actions from 20.0 to 23.0 by 0.5C)
SETPOINTS = np.round(np.arange(20.0, 23.0 + 0.001, 0.5), 1)  # [20.0, 20.5, ..., 23.0]
A = len(SETPOINTS)

class HeatingSetpointDiscrete(gym.ActionWrapper):
    """ Map discrete index -> continuous action vector expected by env. """
    def __init__(self, env, setpoints):
        super().__init__(env)
        self.setpoints = np.array(setpoints, dtype=np.float32)
        # Env expects a vector (because vectorize_actions=True). For heating-only, shape==(1,)
        self.action_space = gym.spaces.Discrete(len(self.setpoints))
    def action(self, a_idx):
        sp = self.setpoints[a_idx]
        return np.array([sp], dtype=np.float32)

env = HeatingSetpointDiscrete(env, SETPOINTS)


ValueError: A price lookahead of 15 mins is less than or is  not a multiple of the control interval 30 mins.

In [None]:
# Bins per feature (tune these):
Tin_bins  = np.arange(18.0, 26.5, 0.5)   # indoor temp
Tout_bins = np.arange(-20.0,  45.0, 5.0) # outdoor temp
Price_bins= np.array([0.0, 0.1, 0.2, 0.4, 1.0, 5.0])  # price tiers
Hour_bins = np.arange(0, 24+1, 3)        # 8 bins over 24h

BIN_EDGES = [Tin_bins, Tout_bins, Price_bins, Hour_bins]

def discretize(obs_vec):
    # obs order matches override_ochre_observations_with_keys
    tin, tout, price, hour = obs_vec[0], obs_vec[1], obs_vec[2], obs_vec[3]
    vals = [tin, tout, price, hour]
    idxs = []
    for v, edges in zip(vals, BIN_EDGES):
        # np.digitize returns 1..len(edges); convert to 0..len(edges)
        i = int(np.digitize(v, edges, right=False))
        i = max(0, min(i, len(edges)))  # clamp
        idxs.append(i)
    return tuple(idxs)  # tuple is hashable => can index Q-table


In [None]:
from collections import defaultdict

# Hyperparameters
alpha = 0.2        # learning rate
gamma = 0.99       # discount
eps_start, eps_end, eps_decay = 1.0, 0.05, 20_000  # linear decay over steps
episodes = 50
max_steps = 2000

# Q-table: dict-of-dicts â†’ default 0.0
Q = defaultdict(lambda: np.zeros(A, dtype=np.float32))

def epsilon(step):
    # linear decay
    frac = max(0.0, 1.0 - step / eps_decay)
    return eps_end + (eps_start - eps_end) * frac

global_step = 0

for ep in range(episodes):
    obs, info = env.reset()
    s = discretize(obs)
    ep_return = 0.0

    for t in range(max_steps):
        e = epsilon(global_step)
        if np.random.rand() < e:
            a = np.random.randint(A)      # explore
        else:
            a = int(np.argmax(Q[s]))      # exploit

        next_obs, r, terminated, truncated, info = env.step(a)
        s_next = discretize(next_obs)

        # Q-learning update
        best_next = np.max(Q[s_next])
        td_target = r + gamma * best_next
        td_error  = td_target - Q[s][a]
        Q[s][a]  += alpha * td_error

        s = s_next
        ep_return += r
        global_step += 1

        if terminated or truncated:
            break

    print(f"Episode {ep+1:02d} | steps={t+1} | return={ep_return:.2f}")


In [None]:
obs, info = env.reset()
s = discretize(obs)
total = 0.0
for t in range(2000):
    a = int(np.argmax(Q[s]))
    obs, r, terminated, truncated, info = env.step(a)
    s = discretize(obs)
    total += r
    if terminated or truncated:
        break
print("Evaluation return:", total)


In [None]:
from stable_baselines3 import DQN
model = DQN("MlpPolicy", env, learning_rate=1e-4, buffer_size=100_000, verbose=1)
model.learn(total_timesteps=400_000)
