In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
import matplotlib.dates as mdates

import ochre_gym

import gymnasium as gym

from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback

In [2]:
# print("--- DIAGNOSTIC START ---")

# env = ochre_gym.load(
#     "bldg0112631-up11",
#     override_equipment_controls={"HVAC Heating": ["Setpoint"]},
#     vectorize_actions=True,
#     vectorize_observations=False,
    
#     start_time="2018-01-01 00:00:00",
#     episode_duration="1 days",
#     time_res="00:30",
#     lookahead="12:00",            
    
#     dr_type="TOU",
#     dr_subfolder="denver",
#     tou_price_file="time_of_use_price.csv"
# )

# obs, info = env.reset()

# print("\n=== FULL OBSERVATION KEY LIST (with sample values) ===")
# for i, (k, v) in enumerate(sorted(obs.items())):
#     print(f"{i+1:02d}: {k:40s} -> {v}")
# print("======================================================")

# env.close()

In [3]:
# -------------------------------------------------------------
# Environment configuration
# -------------------------------------------------------------

ENV_NAME = "bldg0112631-up11"  # environment identifier passed as first argument to ochre_gym.load

# Base observation keys (current values only). Forecast features are not
# listed here; they are appended to the observation by a separate wrapper.
BASE_OBS_KEYS = [
    "Temperature - Indoor (C)",
    "Temperature - Outdoor (C)",
    "Energy Price ($)",
    "Hour of day",
]

def build_obs_keys_with_forecast():
    """
    Return the observation keys requested from OCHRE, sorted alphabetically.

    OCHRE vectorizes observations in alphabetical key order, so the list is
    sorted here to keep list positions aligned with positions in the
    vectorized observation (the index constants are derived from this list).

    Despite the function name, no forecast keys are produced: the 12-hour
    temperature and price forecasts are appended by a separate wrapper.

    Returns:
        A new list containing the entries of BASE_OBS_KEYS in sorted order.
    """
    ordered_keys = list(BASE_OBS_KEYS)
    ordered_keys.sort()
    return ordered_keys

# Alphabetically ordered observation keys requested from the OCHRE environment
OBS_KEYS = build_obs_keys_with_forecast()

# Position of each quantity inside the vectorized observation; derived from
# OBS_KEYS so the indices follow the same (alphabetical) ordering.
(
    INDOOR_TEMP_IDX,
    OUTDOOR_TEMP_IDX,
    PRICE_IDX,
    HOUR_IDX,
) = (
    OBS_KEYS.index(obs_key)
    for obs_key in (
        "Temperature - Indoor (C)",
        "Temperature - Outdoor (C)",
        "Energy Price ($)",
        "Hour of day",
    )
)


def make_env(start_time: str, episode_duration: str):
    """
    Load the OCHRE gym environment for one episode window.

    The heating setpoint of "HVAC Heating" is the only overridden control,
    actions and observations are vectorized, and the observation is limited
    to OBS_KEYS.

    Args:
        start_time: Episode start, forwarded to ochre_gym.load.
        episode_duration: Episode length (e.g. "1 days"), forwarded as-is.

    Returns:
        The environment object returned by ochre_gym.load.
    """
    load_options = dict(
        override_equipment_controls={"HVAC Heating": ["Setpoint"]},
        vectorize_actions=True,
        vectorize_observations=True,
        override_ochre_observations_with_keys=OBS_KEYS,
        # Episode window and simulation resolution
        start_time=start_time,
        episode_duration=episode_duration,
        time_res="00:30",
        lookahead="12:00",  # 12h lookahead is still requested internally
        # Time-of-use demand response setup
        dr_type="TOU",
        dr_subfolder="denver",
        tou_price_file="time_of_use_price.csv",
        # Built-in comfort penalty is disabled (0.0); comfort is handled by a wrapper
        thermal_comfort_band_low=20,
        thermal_comfort_band_high=23,
        thermal_comfort_unit_penalty=0.0,
        reward_scale=0.1,
        # Keep the run quiet
        log_to_file=False,
        log_to_console=False,
    )
    return ochre_gym.load(ENV_NAME, **load_options)

In [None]:
# -------------------------------------------------------------
# Varying thermal comfort band (User-defined Logic)
# -------------------------------------------------------------
def get_comfort_band(hour_of_day: float):
    """
    Map an hour of day to its (low, high) comfort band in °C.

    The hour is wrapped into [0, 24) first.

    - [18:00, 24:00) and [00:00, 09:00): strict band, 19–23 °C
    - [09:00, 18:00): relaxed band, 17–26 °C (09:00 itself is relaxed)

    Returns:
        Tuple (low, high) of floats.
    """
    wrapped_hour = float(hour_of_day) % 24.0

    # Evening, night and early morning use the strict band
    strict_band = wrapped_hour >= 18.0 or wrapped_hour < 9.0
    return (19.0, 23.0) if strict_band else (17.0, 26.0)


def is_occupied(hour_of_day: float) -> bool:
    """
    Tell whether the building counts as occupied at the given hour.

    The hour is wrapped into [0, 24). Occupied hours are 18:00–24:00 and
    00:00–09:00; 09:00 itself is not occupied.
    """
    wrapped_hour = float(hour_of_day) % 24.0
    if wrapped_hour < 9.0:
        return True
    return wrapped_hour >= 18.0


def get_preferred_temp(hour_of_day: float):
    """
    Preferred indoor temperature (°C) for the given hour.

    Returns:
        21.0 while is_occupied(hour_of_day) is true, otherwise None.
    """
    return 21.0 if is_occupied(hour_of_day) else None


# -------------------------------------------------------------
# Wrapper to apply varying thermal comfort band to the reward
# -------------------------------------------------------------
class VariableComfortRewardWrapper(gym.Wrapper):
    """
    Reward-shaping wrapper around the base OCHRE env.

    On every step the base reward is multiplied by ``energy_cost_multiplier``
    and two penalties are subtracted: a penalty for leaving the hour-dependent
    comfort band and, optionally, a penalty for deviating from a preferred
    temperature while occupied and off-peak. All intermediate quantities are
    written into a copy of ``info`` for logging.
    """

    def __init__(
        self,
        env,
        comfort_unit_penalty: float = 0.0,
        reward_scale: float = 0.1,
        energy_cost_multiplier: float = 1.0,
        preferred_temp: float | None = 21.0,
        preferred_temp_weight: float = 0.0,  # set >0 to activate preference
        peak_price_threshold: float | None = None,
        peak_hours: tuple[tuple[float, float], ...] | None = None,
    ):
        """
        Args:
            env: base OCHRE env, expected to be configured with
                thermal_comfort_unit_penalty=0.0.
            comfort_unit_penalty: penalty per °C outside the comfort band.
            reward_scale: scale applied to both penalty terms (meant to be
                the same factor the base env uses).
            energy_cost_multiplier: factor applied to the base env reward.
            preferred_temp: target indoor temperature during occupancy (°C);
                None disables the preference term.
            preferred_temp_weight: weight of |T_in - preferred_temp|; the
                preference term is only active when this is > 0.
            peak_price_threshold: if not None, a step whose price is known
                and >= this value is treated as "peak".
            peak_hours: fallback peak window(s) as (start, end) hours of day,
                e.g. ((17.0, 21.0),) for 17:00–21:00. Defaults to that window.
        """
        super().__init__(env)
        self.comfort_unit_penalty = comfort_unit_penalty
        self.reward_scale = reward_scale
        self.energy_cost_multiplier = energy_cost_multiplier
        self.preferred_temp = preferred_temp
        self.preferred_temp_weight = preferred_temp_weight

        self.peak_price_threshold = peak_price_threshold
        self.peak_hours = ((17.0, 21.0),) if peak_hours is None else peak_hours

    def reset(self, **kwargs):
        """Pass-through reset; observation and info are returned unchanged."""
        return self.env.reset(**kwargs)

    def _hour_from_info(self, info, obs=None) -> float:
        """
        Resolve the hour of day, preferring the observation vector.

        Lookup order: obs[HOUR_IDX] (what the agent sees), then
        info["Hour of day"], then 0.0 as a last resort.
        """
        if obs is not None:
            flat_obs = np.asarray(obs).ravel()
            if flat_obs.size > HOUR_IDX:
                return float(flat_obs[HOUR_IDX])

        info_has_hour = isinstance(info, dict) and ("Hour of day" in info)
        if info_has_hour:
            return float(info["Hour of day"])

        return 0.0

    def _get_current_price(self, info, obs=None) -> float:
        """
        Resolve the current energy price.

        Lookup order: info["Energy Price ($)"], then obs[PRICE_IDX]. A value
        that cannot be converted to float is skipped. Returns NaN when no
        source yields a price.
        """
        price_key = "Energy Price ($)"
        if isinstance(info, dict) and (price_key in info):
            try:
                return float(info[price_key])
            except (TypeError, ValueError):
                pass  # fall through to the observation vector

        if obs is not None:
            flat_obs = np.asarray(obs).ravel()
            if flat_obs.size > PRICE_IDX:
                try:
                    return float(flat_obs[PRICE_IDX])
                except (TypeError, ValueError):
                    pass

        return np.nan

    def _is_peak_price_period(self, hour_of_day: float, price: float) -> bool:
        """
        Decide whether this step is a peak-price period.

        With a configured threshold and a non-NaN price the decision is
        ``price >= threshold``. Otherwise the hour (wrapped into [0, 24)) is
        checked against the peak_hours windows; a window whose start exceeds
        its end is treated as wrapping midnight (e.g. 22–03).
        """
        threshold = self.peak_price_threshold
        if (threshold is not None) and (not np.isnan(price)):
            return price >= threshold

        wrapped_hour = float(hour_of_day) % 24.0
        return any(
            (start <= wrapped_hour < end)
            if start <= end
            else (wrapped_hour >= start or wrapped_hour < end)
            for start, end in self.peak_hours
        )

    def step(self, action):
        """
        Step the wrapped env and replace its reward with the shaped reward.

        Returns:
            (obs, new_reward, terminated, truncated, info) where info is a
            copy of the base info extended with the logged quantities.
        """
        obs, reward, terminated, truncated, info = self.env.step(action)

        # Energy part of the reward
        scaled_energy_reward = reward * self.energy_cost_multiplier

        # Comfort band follows the hour the agent observes
        hour_of_day = self._hour_from_info(info, obs)
        comfort_low, comfort_high = get_comfort_band(hour_of_day)

        # Indoor temperature: info first, observation vector as fallback
        temp_key = "Temperature - Indoor (C)"
        if isinstance(info, dict) and (temp_key in info):
            indoor_temp = float(info[temp_key])
        else:
            flat_obs = np.asarray(obs).ravel()
            if flat_obs.size > INDOOR_TEMP_IDX:
                indoor_temp = float(flat_obs[INDOOR_TEMP_IDX])
            else:
                indoor_temp = np.nan

        current_price = self._get_current_price(info, obs)
        is_peak = self._is_peak_price_period(hour_of_day, current_price)

        # Distance outside the band (a NaN temperature yields 0.0 because
        # both comparisons are false)
        if indoor_temp < comfort_low:
            comfort_violation = comfort_low - indoor_temp
        elif indoor_temp > comfort_high:
            comfort_violation = indoor_temp - comfort_high
        else:
            comfort_violation = 0.0

        comfort_penalty_term = (
            self.reward_scale * self.comfort_unit_penalty * comfort_violation
        )

        # Preference penalty applies only while occupied and off-peak
        occ = is_occupied(hour_of_day)
        pref_penalty_active = bool(
            occ
            and (not is_peak)
            and self.preferred_temp is not None
            and self.preferred_temp_weight > 0.0
            and not np.isnan(indoor_temp)
        )
        if pref_penalty_active:
            pref_penalty_term = (
                self.reward_scale
                * self.preferred_temp_weight
                * abs(indoor_temp - self.preferred_temp)
            )
        else:
            pref_penalty_term = 0.0

        new_reward = scaled_energy_reward - comfort_penalty_term - pref_penalty_term

        # Log everything on a copy so the base env's dict is not mutated
        info = dict(info)
        info.update(
            {
                "hour_used_for_band": hour_of_day,
                "comfort_low": comfort_low,
                "comfort_high": comfort_high,
                "comfort_violation": comfort_violation,
                "original_energy_reward": reward,
                "scaled_energy_reward": scaled_energy_reward,
                "custom_reward": new_reward,
                "occupied": occ,
                "preferred_temp": self.preferred_temp,
                "pref_penalty_term": pref_penalty_term,
                "current_price_for_pref": current_price,
                "is_peak_price_period": is_peak,
                "pref_penalty_active": pref_penalty_active,
            }
        )

        return obs, new_reward, terminated, truncated, info

In [None]:
# -------------------------------------------------------------
# Wrapper to append 12-hour forecasts (T_out, price) to obs
# -------------------------------------------------------------
class ForecastObsWrapper(gym.Wrapper):
    """
    Observation wrapper that appends future outdoor temperature and energy
    price values (over a lookahead horizon) to the base observation.
    """

    def __init__(
        self,
        env,
        start_time: str,
        episode_duration: str,
        horizon_hours: float = 12.0,
        time_res_minutes: int = 30,
    ):
        """
        Wraps an OCHRE env and augments the observation with future
        outdoor temperature and energy prices over a lookahead horizon.

        The base env exposes the core obs defined in OBS_KEYS, which are
        vectorized in alphabetical order by OCHRE; semantics should always
        be accessed via the indices PRICE_IDX, HOUR_IDX, INDOOR_TEMP_IDX,
        and OUTDOOR_TEMP_IDX rather than assuming a fixed position.

        This wrapper appends:
        [T_out(t+1), ..., T_out(t+H),
         price(t+1), ..., price(t+H)]
        where H = int(horizon_hours * 60 / time_res_minutes).

        Args:
            env: env to wrap; its observation_space must expose low/high.
            start_time: episode start used for the forecast pre-rollout.
            episode_duration: episode length used for the pre-rollout.
            horizon_hours: lookahead horizon in hours.
            time_res_minutes: step length in minutes used to convert the
                horizon into a number of steps.
                NOTE(review): make_env fixes the simulation resolution at
                "00:30", so a value other than 30 would make the horizon
                step count disagree with the env's actual step length.
        """
        super().__init__(env)
        self.start_time = start_time
        self.episode_duration = episode_duration
        self.horizon_hours = horizon_hours
        self.time_res_minutes = time_res_minutes
        self.horizon_steps = int(horizon_hours * 60 / time_res_minutes)

        # Internal step counter within the episode
        self._current_step = 0

        # Precompute outdoor temperature and price for this episode window
        self._build_forecast_index()

        # Extend observation space: original dims + 2 * horizon_steps
        orig_space = env.observation_space
        orig_low = np.asarray(orig_space.low, dtype=np.float32).ravel()
        orig_high = np.asarray(orig_space.high, dtype=np.float32).ravel()

        extra_dim = 2 * self.horizon_steps  # future temps + future prices
        extra_low = np.full(extra_dim, -np.inf, dtype=np.float32)
        extra_high = np.full(extra_dim, np.inf, dtype=np.float32)

        self.observation_space = gym.spaces.Box(
            low=np.concatenate([orig_low, extra_low]),
            high=np.concatenate([orig_high, extra_high]),
            dtype=np.float32,
        )

    def _build_forecast_index(self):
        """
        Build a mapping from step index -> {T_out, price} for the whole
        episode window by running a temporary OCHRE env once.

        The temporary env is created through make_env so that it is
        configured exactly like the env built for training/evaluation, and
        it is always closed, even if the rollout raises.

        Sets:
            self._forecast_data: dict step index -> dict with keys
                "Temperature - Outdoor (C)" and "Energy Price ($)".
            self._episode_length: number of recorded steps.
        """
        tmp_env = make_env(self.start_time, self.episode_duration)

        forecast_data = {}
        step_idx = 0
        # Any fixed action works here: the recorded quantities (outdoor
        # temperature and price) are assumed not to depend on the action.
        fixed_action = np.array([21.0], dtype=np.float32)

        try:
            obs, info = tmp_env.reset()
            while True:
                obs_vec = np.asarray(obs).ravel()

                # Prefer info keys; fall back to obs vector if needed
                if isinstance(info, dict) and ("Temperature - Outdoor (C)" in info):
                    t_out = info["Temperature - Outdoor (C)"]
                else:
                    t_out = obs_vec[OUTDOOR_TEMP_IDX]

                if isinstance(info, dict) and ("Energy Price ($)" in info):
                    price = info["Energy Price ($)"]
                else:
                    price = obs_vec[PRICE_IDX]

                forecast_data[step_idx] = {
                    "Temperature - Outdoor (C)": float(t_out),
                    "Energy Price ($)": float(price),
                }

                obs, _reward, terminated, truncated, info = tmp_env.step(fixed_action)
                step_idx += 1
                if terminated or truncated:
                    break
        finally:
            tmp_env.close()

        self._forecast_data = forecast_data
        self._episode_length = step_idx  # total number of steps in the episode

    def _augment_observation(self, obs, step_idx, info):
        """
        Concatenate current obs with future T_out and price over the horizon.

        For lookahead indices past the recorded episode, the last recorded
        values are repeated. The appended values use the dtype of the base
        observation vector.

        Args:
            obs: base observation (flattened before concatenation).
            step_idx: index of the current step within the episode.
            info: unused; kept for call-site symmetry.

        Returns:
            1-D array: base obs, then horizon_steps temperatures, then
            horizon_steps prices.
        """
        obs_vec = np.asarray(obs).ravel()
        last_idx = self._episode_length - 1

        future_temps = []
        future_prices = []
        for k in range(1, self.horizon_steps + 1):
            # Clamp to the last recorded step when looking past episode end
            data_k = self._forecast_data[min(step_idx + k, last_idx)]
            future_temps.append(data_k["Temperature - Outdoor (C)"])
            future_prices.append(data_k["Energy Price ($)"])

        extra = np.asarray(future_temps + future_prices, dtype=obs_vec.dtype)
        return np.concatenate([obs_vec, extra], axis=0)

    def reset(self, **kwargs):
        """Reset the wrapped env, restart the step counter, augment the obs."""
        self._current_step = 0
        obs, info = self.env.reset(**kwargs)
        obs_aug = self._augment_observation(obs, self._current_step, info)
        return obs_aug, info

    def step(self, action):
        """Step the wrapped env, advance the step counter, augment the obs."""
        obs, reward, terminated, truncated, info = self.env.step(action)
        self._current_step += 1
        obs_aug = self._augment_observation(obs, self._current_step, info)
        return obs_aug, reward, terminated, truncated, info

In [None]:
# -------------------------------------------------------------
# Sanity check: verify comfort band + key observations
# -------------------------------------------------------------
# Ensure start times match
# Start of the 1-day debug episode; also used to reconstruct timestamps below
START_TIME_STR = "2018-02-01 00:00:00"

# Base env wrapped with the variable comfort-band reward (no forecast wrapper here)
debug_env = VariableComfortRewardWrapper(
    make_env(START_TIME_STR, "1 days"),
    comfort_unit_penalty=50.0,
    reward_scale=0.1,
    energy_cost_multiplier=15.0,
    preferred_temp=21.0,
    preferred_temp_weight=15.0,
    peak_hours=((17.0, 21.0),),  # <-- explicit: 17:00–21:00 is peak
    # or use: peak_price_threshold=0.21
)

obs, info = debug_env.reset()
rows = []

start_ts = pd.Timestamp(START_TIME_STR)

# 24 hours at 30 min/step is 48 steps; range(50) is only an upper bound,
# the loop exits as soon as the env reports terminated/truncated.
# Each row is built from the obs/info returned by the previous reset()/step()
# call, so the first row comes from reset(): the wrapper only writes its
# logging keys in step(), hence those columns fall back to NaN on that row.
for t in range(50):
    # Datetime from info if present, otherwise reconstruct from start time
    dt_info = info.get("Datetime", None) if isinstance(info, dict) else None
    if dt_info is None:
        dt = start_ts + pd.Timedelta(minutes=30 * t)
    else:
        dt = pd.to_datetime(dt_info)

    # Flatten observation vector
    obs_arr = np.asarray(obs).ravel()

    # Semantic obs via indices (NaN if the vector is shorter than expected)
    price_obs = obs_arr[PRICE_IDX] if obs_arr.size > PRICE_IDX else np.nan
    hour_obs  = obs_arr[HOUR_IDX]  if obs_arr.size > HOUR_IDX  else np.nan
    tin_obs   = obs_arr[INDOOR_TEMP_IDX] if obs_arr.size > INDOOR_TEMP_IDX else np.nan
    tout_obs  = obs_arr[OUTDOOR_TEMP_IDX] if obs_arr.size > OUTDOOR_TEMP_IDX else np.nan

    row = {
        "Datetime": dt,

        # What the wrapper logged (NaN when the key is absent from info)
        "hour_used_for_band": info.get("hour_used_for_band", np.nan),
        "comfort_low_info": info.get("comfort_low", np.nan),
        "comfort_high_info": info.get("comfort_high", np.nan),
        "comfort_violation_info": info.get("comfort_violation", np.nan),
        "original_energy_reward": info.get("original_energy_reward", np.nan),
        "scaled_energy_reward": info.get("scaled_energy_reward", np.nan),
        "custom_reward": info.get("custom_reward", np.nan),
        "occupied": info.get("occupied", np.nan),
        "preferred_temp": info.get("preferred_temp", np.nan),
        "pref_penalty_term": info.get("pref_penalty_term", np.nan),
        "current_price_for_pref": info.get("current_price_for_pref", np.nan),
        "is_peak_price_period": info.get("is_peak_price_period", np.nan),
        "pref_penalty_active": info.get("pref_penalty_active", np.nan),

        # Physical variables as reported in info (NaN when not reported)
        "indoor_temp_info": info.get("Temperature - Indoor (C)", np.nan),
        "outdoor_temp_info": info.get("Temperature - Outdoor (C)", np.nan),
        "price_info": info.get("Energy Price ($)", np.nan),
        "hour_info": info.get("Hour of day", np.nan),

        # Semantic obs from vector
        "price_obs": price_obs,
        "hour_obs": hour_obs,
        "T_in_obs": tin_obs,
        "T_out_obs": tout_obs,
    }
    rows.append(row)

    # Fixed action (e.g., 21°C setpoint) – only for advancing the env
    action = np.array([21.0], dtype=np.float32)
    obs, r, terminated, truncated, info = debug_env.step(action)
    if terminated or truncated:
        break

debug_env.close()

debug_df = pd.DataFrame(rows)

# ---- Replace the Datetime column by a time-only column named after the date ----
if not debug_df.empty:
    # Get date from first row
    first_date = pd.to_datetime(debug_df["Datetime"].iloc[0]).date()
    time_col_name = f"Datetime ({first_date})"

    # Convert to time-only strings
    debug_df[time_col_name] = pd.to_datetime(debug_df["Datetime"]).dt.strftime("%H:%M:%S")

    # Drop original Datetime column
    debug_df = debug_df.drop(columns=["Datetime"])

    # ---- Show obs_* mapping in column headers ----
    # The obs_N labels are hard-coded and assume the alphabetical OBS_KEYS
    # ordering (price=0, hour=1, T_in=2, T_out=3).
    debug_df = debug_df.rename(
        columns={
            "price_obs": "price_obs (obs_0)",
            "hour_obs": "hour_obs (obs_1)",
            "T_in_obs": "T_in_obs (obs_2)",
            "T_out_obs": "T_out_obs (obs_3)",
        }
    )

    # Info-based variables (env + wrapper); commented entries are available
    # columns that are currently hidden from the displayed table.
    info_cols = [
        time_col_name,
        "hour_used_for_band",
        # "comfort_low_info",
        # "comfort_high_info",
        # "comfort_violation_info",
        # "original_energy_reward",
        # "scaled_energy_reward",
        # "custom_reward",
        "occupied",
        # "preferred_temp",
        # "pref_penalty_term",
        "current_price_for_pref",
        "is_peak_price_period",
        "pref_penalty_active",
        # "indoor_temp_info",
        # "outdoor_temp_info",
        # "price_info",
        # "hour_info",
    ]
    info_df = debug_df[info_cols].copy()

    # Observation vector (what RL sees)
    obs_cols = [
        time_col_name,
        "price_obs (obs_0)",
        "hour_obs (obs_1)",
        "T_in_obs (obs_2)",
        "T_out_obs (obs_3)",
    ]
    obs_df = debug_df[obs_cols].copy()

    print("=== Info / Reward / Physical Vars ===")
    display(info_df)

    print("=== Observation Vector (RL Inputs) ===")
    display(obs_df)


In [None]:
# -------------------------------------------------------------
# Full augmented observation table (all 52 features)
# -------------------------------------------------------------
# Layout of the augmented observation: BASE_DIM base values, then H future
# outdoor temperatures, then H future prices.
HORIZON_HOURS = 12.0
TIME_RES_MIN = 30
H = int(HORIZON_HOURS * 60 / TIME_RES_MIN)   # 12h / 30min = 24 steps
BASE_DIM = len(OBS_KEYS)                     # 4 base obs

EXTRA_TEMP_START = BASE_DIM                  # first index of T_out(t+1..t+H)
EXTRA_PRICE_START = BASE_DIM + H             # first index of price(t+1..t+H)

# Build wrapped env: base -> comfort reward -> forecast obs.
# The same start time and duration are passed to make_env and to the
# forecast wrapper so that the precomputed forecasts cover the same window.
full_env = ForecastObsWrapper(
    VariableComfortRewardWrapper(
        make_env("2018-01-06 00:00:00", "1 days"),
        comfort_unit_penalty=50.0,
        reward_scale=0.1,
        energy_cost_multiplier=15.0,
        preferred_temp=21.0,
        preferred_temp_weight=15.0,
        peak_hours=((17.0, 21.0),),   # <-- ensure 17:00–21:00 is treated as peak
        # or alternatively: peak_price_threshold=0.21
    ),
    start_time="2018-01-06 00:00:00",
    episode_duration="1 days",
    horizon_hours=HORIZON_HOURS,
    time_res_minutes=TIME_RES_MIN,
)

obs, info = full_env.reset()
rows = []

start_ts = pd.Timestamp("2018-01-06 00:00:00")

# 48 steps = 24 hours at 30 min/step.
# Each row records the observation returned by the previous reset()/step().
for t in range(48):
    # Datetime from info if present, otherwise reconstruct from start time
    dt_info = info.get("Datetime", None) if isinstance(info, dict) else None
    if dt_info is None:
        dt = start_ts + pd.Timedelta(minutes=TIME_RES_MIN * t)
    else:
        dt = pd.to_datetime(dt_info)

    obs_arr = np.asarray(obs).ravel()

    row = {
        "step": t,
        "Datetime": dt,
        "obs_dim": obs_arr.size,  # total length of the augmented observation
        # base obs via semantic indices
        "price_now":  obs_arr[PRICE_IDX],
        "hour_now":   obs_arr[HOUR_IDX],
        "T_in_now":   obs_arr[INDOOR_TEMP_IDX],
        "T_out_now":  obs_arr[OUTDOOR_TEMP_IDX],
    }

    # All future outdoor temperatures (k steps ahead, k = 1..H)
    for k in range(1, H + 1):
        row[f"T_out+{k}"] = obs_arr[EXTRA_TEMP_START + k - 1]

    # All future prices (k steps ahead, k = 1..H)
    for k in range(1, H + 1):
        row[f"price+{k}"] = obs_arr[EXTRA_PRICE_START + k - 1]

    rows.append(row)

    action = np.array([21.0], dtype=np.float32)  # fixed setpoint just to advance
    obs, r, terminated, truncated, info = full_env.step(action)
    if terminated or truncated:
        break

full_env.close()

full_obs_df = pd.DataFrame(rows)

# Optional: pretty time-only column like other debug tables
if not full_obs_df.empty:
    first_date = pd.to_datetime(full_obs_df["Datetime"].iloc[0]).date()
    time_col_name = f"Datetime ({first_date})"
    full_obs_df[time_col_name] = pd.to_datetime(full_obs_df["Datetime"]).dt.strftime("%H:%M:%S")
    full_obs_df = full_obs_df.drop(columns=["Datetime"])
    # Put time + step first, keep the remaining columns in their existing order
    cols = [time_col_name, "step"] + [c for c in full_obs_df.columns if c not in [time_col_name, "step"]]
    full_obs_df = full_obs_df[cols]

# Last expression of the cell: renders the table
full_obs_df


In [None]:
# -------------------------------------------------------------
# Monitor-wrapped training env (31-day episode starting Jan 1)
#   Stack: OCHRE -> ComfortReward -> ForecastObs -> Monitor
# -------------------------------------------------------------
def make_train_env(
    start_time: str = "2018-01-01 00:00:00",
    episode_duration: str = "31 days",
):
    """
    Build the training environment stack:
    OCHRE -> VariableComfortRewardWrapper -> ForecastObsWrapper -> Monitor.

    Args:
        start_time: episode start; defaults to Jan 1, 2018 (the original
            hard-coded training window).
        episode_duration: episode length; defaults to a full 31-day month.

    Returns:
        The Monitor-wrapped environment. Calling the function without
        arguments reproduces the original fixed configuration, so it can
        still be passed as a zero-argument factory.
    """
    base = make_env(start_time, episode_duration)

    # Apply the varying comfort-band reward wrapper
    base = VariableComfortRewardWrapper(
        base,
        comfort_unit_penalty=50.0,
        reward_scale=0.1,
        energy_cost_multiplier=15.0,
        preferred_temp=21.0,
        preferred_temp_weight=15.0,
        peak_hours=((17.0, 21.0),),  # <-- explicit: 17:00–21:00 is peak
    )

    # Append 12-hour forecasts of T_out and price to the observation; the
    # same window is passed so the precomputed forecasts match the episode.
    base = ForecastObsWrapper(
        base,
        start_time=start_time,
        episode_duration=episode_duration,
        horizon_hours=12.0,
        time_res_minutes=30,
    )

    return Monitor(base)

# Single-environment vectorized wrapper; DummyVecEnv receives the factory
# function itself (not an env instance) inside a one-element list.
train_env = DummyVecEnv([make_train_env])


# -------------------------------------------------------------
# Callback to record episode returns during training
# -------------------------------------------------------------
class EpisodeRewardCallback(BaseCallback):
    """
    Training callback that collects the return of every finished episode.

    Attributes:
        ep_returns: episode returns in the order they were reported.
    """

    def __init__(self, verbose: int = 0):
        super().__init__(verbose)
        self.ep_returns: list[float] = []

    def _on_step(self) -> bool:
        """
        Scan the current step's infos for an "episode" entry and store its
        "r" value (presumably written by the Monitor wrapper when an episode
        ends — confirm). Always returns True, i.e. never stops training.
        """
        step_infos = self.locals.get("infos", [])
        self.ep_returns.extend(
            step_info["episode"]["r"]
            for step_info in step_infos
            if "episode" in step_info
        )
        return True


# -------------------------------------------------------------
# SAC model definition
# -------------------------------------------------------------
# Fixed seed so that network initialisation, action sampling and replay
# sampling are repeatable across "Restart & Run All" executions.
SAC_SEED = 42

sac_model = SAC(
    "MlpPolicy",
    train_env,
    verbose=2,
    learning_rate=3e-4,
    gamma=0.99,
    buffer_size=50_000,
    batch_size=256,
    tau=0.005,
    train_freq=1,
    gradient_steps=1,
    ent_coef="auto",
    seed=SAC_SEED,
)

# -------------------------------------------------------------
# Train SAC and log episode returns
# -------------------------------------------------------------
callback = EpisodeRewardCallback()

sac_model.learn(
    total_timesteps=60_000,
    log_interval=10,
    callback=callback,
)

# Episode returns collected by the callback, as a float array for plotting
ep_returns_sac = np.asarray(callback.ep_returns, dtype=float)
print(f"Recorded {len(ep_returns_sac)} finished episodes.")

In [None]:
# -------------------------------------------------------------
# Plot: evolution of episode returns during training
# -------------------------------------------------------------
if len(ep_returns_sac) == 0:
    print("No episode returns were logged.")
else:
    # 1-based episode numbers for the x axis
    episodes = np.arange(1, len(ep_returns_sac) + 1)

    fig, ax = plt.subplots(figsize=(7, 4))
    ax.plot(episodes, ep_returns_sac, label="Episode Return", alpha=0.6)

    # Moving average; only drawn once at least `window` episodes exist.
    # With mode="valid" the first averaged point belongs to episode `window`.
    window = 5
    if len(ep_returns_sac) >= window:
        ma = np.convolve(ep_returns_sac, np.ones(window) / window, mode="valid")
        ma_episodes = np.arange(window, len(ep_returns_sac) + 1)
        ax.plot(
            ma_episodes,
            ma,
            label=f"{window}-episode moving avg",
            linewidth=2,
        )

    ax.set_xlabel("Episode")
    ax.set_ylabel("Return")
    ax.grid(True, linestyle="--", alpha=0.5)
    ax.legend()
    fig.tight_layout()
    plt.show()


In [None]:
# ===========================
# Helpers
# ===========================
def is_occupied(hour_of_day: float) -> bool:
    """
    Return True for occupied hours: 18:00–24:00 and 00:00–09:00 (09:00
    excluded), after wrapping the hour into [0, 24).

    NOTE(review): this repeats the is_occupied defined next to
    get_comfort_band with the same rule; the original note says it was
    restored to fix a TypeError in the environment wrapper — confirm whether
    the duplicate is still needed.
    """
    wrapped_hour = float(hour_of_day) % 24.0
    return wrapped_hour < 9.0 or wrapped_hour >= 18.0

def safe_datetime(info, t, start="2018-02-01", minutes=30):
    """
    Timestamp for step ``t``.

    Uses info["Datetime"] when info is a dict holding a non-None value
    (strings are parsed with pd.to_datetime, other values are returned
    unchanged). Otherwise the timestamp is reconstructed as
    ``start + t * minutes``.
    """
    stamp = info.get("Datetime", None) if isinstance(info, dict) else None
    if stamp is None:
        offset = pd.Timedelta(minutes=minutes * t)
        return pd.Timestamp(start) + offset
    return pd.to_datetime(stamp) if isinstance(stamp, str) else stamp

def extract_indoor_temp(obs, info):
    """
    Indoor temperature (°C) as a float.

    Taken from info["Temperature - Indoor (C)"] when available; otherwise
    from position 2 of the flattened observation (the slot of the indoor
    temperature in the sorted OBS_KEYS). NaN if the observation is too short.
    """
    key = "Temperature - Indoor (C)"
    if isinstance(info, dict) and (key in info):
        return float(info[key])

    flat_obs = np.asarray(obs).ravel()
    if flat_obs.size < 3:
        return np.nan
    return float(flat_obs[2])

def extract_outdoor_temp(obs, info):
    """
    Outdoor temperature (°C) as a float.

    Taken from info["Temperature - Outdoor (C)"] when available; otherwise
    from position 3 of the flattened observation (the slot of the outdoor
    temperature in the sorted OBS_KEYS). NaN if the observation is too short.
    """
    key = "Temperature - Outdoor (C)"
    if isinstance(info, dict) and (key in info):
        return float(info[key])

    flat_obs = np.asarray(obs).ravel()
    if flat_obs.size < 4:
        return np.nan
    return float(flat_obs[3])

def extract_obs_components(obs):
    """
    Split the leading four entries of the flattened observation.

    Layout read from the vector:
        obs[0] = price_now, obs[1] = hour_now,
        obs[2] = T_in_now,  obs[3] = T_out_now

    Returns:
        Tuple (tin, tout, price, hour) of floats — note the order differs
        from the vector layout. Entries missing from a short vector are NaN.
    """
    flat_obs = np.asarray(obs).ravel()
    price, hour, tin, tout = (
        float(flat_obs[pos]) if flat_obs.size > pos else np.nan
        for pos in range(4)
    )
    return tin, tout, price, hour

def compute_comfort_bands_from_hour(hours):
    """
    Vectorized comfort-band lookup using the same hour rule as is_occupied().

    Args:
        hours: scalar or array-like of hour-of-day values (wrapped mod 24).

    Returns:
        (lows, highs) arrays: 19/23 °C where the hour is >= 18 or < 9,
        17/26 °C otherwise.
    """
    wrapped = np.mod(np.asarray(hours, dtype=float), 24.0)
    strict_mask = np.logical_or(wrapped >= 18.0, wrapped < 9.0)
    return np.where(strict_mask, 19.0, 17.0), np.where(strict_mask, 23.0, 26.0)


# ===========================
# Baseline Controller
# ===========================
class RealisticThermostat:
    """
    Baseline controller that always answers with one fixed setpoint.

    Exposes a model-like ``predict`` so it can be used wherever a trained
    policy's predict(obs, deterministic=...) is called.
    """

    def __init__(self, setpoint=21):
        # Stored as a length-1 float32 array, i.e. already in action form
        self.setpoint = np.asarray([setpoint], dtype=np.float32)

    def predict(self, obs, deterministic=True):
        """Return (setpoint array, None); obs and deterministic are ignored."""
        action, state = self.setpoint, None
        return action, state

# ===========================
# Evaluation Function
# ===========================
def evaluate_policy_with_actions(env, model, max_steps=2000):
    """Roll out ``model`` on ``env`` for one episode and log the trajectory.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns ``(obs, reward, terminated, truncated, info)``.
        model: Object with ``predict(obs, deterministic=True)`` returning
            ``(action, state)``; only the first action element is logged.
        max_steps: Upper bound on the number of environment steps.

    Returns:
        pandas.DataFrame with one row for the initial (pre-step) state followed
        by one row per executed step, including the step on which the episode
        terminated or was truncated. Columns: "Datetime", "Setpoint_RL",
        "IndoorTemp_RL", "OutdoorTemp_RL", "Tout", "Price", "Hour".
        Rows 1.. pair the action of a step with the state observed after it.

    Raises:
        RuntimeError: if the model returns an empty action.
    """
    obs, info = env.reset()

    # Row 0: state returned by reset(), before any action has been applied.
    _, tout0, price0, hour0 = extract_obs_components(obs)
    data = [{
        "Datetime": safe_datetime(info, 0),
        "Setpoint_RL": np.nan,  # no action applied yet
        "IndoorTemp_RL": extract_indoor_temp(obs, info),
        "OutdoorTemp_RL": extract_outdoor_temp(obs, info),
        "Tout": tout0,
        "Price": price0,
        "Hour": hour0,
    }]

    for t in range(max_steps):
        action, _ = model.predict(obs, deterministic=True)
        action = np.asarray(action, dtype=np.float32).ravel()
        if action.size == 0:
            raise RuntimeError("Empty action from model.")
        sp = float(action[0])

        obs, _, terminated, truncated, info = env.step(action)
        _, tout, price, hour = extract_obs_components(obs)

        data.append({
            # use t+1 so fallback time is 00:30, 01:00, ... after the 00:00 row
            "Datetime": safe_datetime(info, t + 1),
            "Setpoint_RL": sp,
            "IndoorTemp_RL": extract_indoor_temp(obs, info),
            "OutdoorTemp_RL": extract_outdoor_temp(obs, info),
            "Tout": tout,
            "Price": price,
            "Hour": hour,
        })

        if terminated or truncated:
            break

    # Build the frame once, after the loop. Doing it inside the loop (after the
    # break check) dropped the terminal step and left `df` unbound whenever the
    # episode ended on its very first step.
    df = pd.DataFrame(data)

    # Back-fill the initial row's setpoint with the first applied action so the
    # setpoint series has no leading NaN.
    if len(df) >= 2 and np.isnan(df.loc[0, "Setpoint_RL"]):
        df.loc[0, "Setpoint_RL"] = df.loc[1, "Setpoint_RL"]

    return df


# ===========================
# Evaluation Env (UPDATED CONFIG)
# ===========================
def make_eval_env(
    start_time: str = "2018-02-01 00:00:00",
    episode_duration: str = "7 days",
):
    """Build the wrapped evaluation environment for one evaluation window.

    The env returned by ``make_env(start_time, episode_duration)`` is wrapped
    in ``VariableComfortRewardWrapper`` and then in ``ForecastObsWrapper``.
    With the default arguments the window is 7 days starting 2018-02-01 00:00.
    """
    reward_settings = {
        "comfort_unit_penalty": 50.0,
        "reward_scale": 0.1,
        "energy_cost_multiplier": 15.0,
        "preferred_temp": 21.0,
        "preferred_temp_weight": 15.0,
        # Peak window handed to the reward wrapper for evaluation.
        "peak_hours": ((17.0, 21.0),),
    }
    forecast_settings = {
        "start_time": start_time,
        "episode_duration": episode_duration,
        "horizon_hours": 12.0,
        "time_res_minutes": 30,
    }

    rewarded_env = VariableComfortRewardWrapper(
        make_env(start_time, episode_duration), **reward_settings
    )
    return ForecastObsWrapper(rewarded_env, **forecast_settings)

# Build the evaluation env with make_eval_env's defaults
# (start "2018-02-01 00:00:00", episode duration "7 days").
env_feb = make_eval_env()

# ===========================
# Run Simulations
# ===========================
# Both controllers are rolled out on the same env instance, one after the other.
print("Running RL Agent...")
# NOTE(review): evaluate_policy_with_actions() calls env.reset() itself, so this
# explicit reset looks redundant — confirm before removing.
env_feb.reset()
rl_df = evaluate_policy_with_actions(env_feb, sac_model)

print("Running Baseline...")
env_feb.reset()
# Constant 21 °C setpoint used as the reference controller.
baseline_model = RealisticThermostat(setpoint=21)
base_df = evaluate_policy_with_actions(env_feb, baseline_model)

# Compute time-varying comfort bands from the logged "Hour" observation of the
# RL run (not from the Datetime column); the plots below reuse these bands for
# both controllers.
comfort_low_series, comfort_high_series = compute_comfort_bands_from_hour(
    rl_df["Hour"]
)

# ===========================
# Figure 1: Indoor Temperature Comparison
# ===========================
# Overlays RL and baseline setpoints / indoor temperatures with the comfort
# band and shades the 17:00–21:00 peak-price window. Every call below goes
# through the pyplot state machine and therefore targets the figure created
# on the next line; legend entries follow the order the artists are added.
plt.figure(figsize=(12, 6))

# RL Plots
plt.plot(rl_df["Datetime"], rl_df["Setpoint_RL"], color="black", linestyle="-", linewidth=2, label="RL Setpoint")
plt.plot(rl_df["Datetime"], rl_df["IndoorTemp_RL"], color="magenta", linestyle="-", alpha=0.9, linewidth=1.5, label="RL Indoor Temp")

# Baseline Plots (same colours as RL, dashed)
plt.plot(base_df["Datetime"], base_df["Setpoint_RL"], color="black", linestyle="--", linewidth=2, label="Baseline Setpoint (21°C)")
plt.plot(base_df["Datetime"], base_df["IndoorTemp_RL"], color="magenta", linestyle="--", alpha=0.9, linewidth=1.5, label="Baseline Indoor Temp")

# Comfort Bands (STEP PLOTS)
# Only the lower limit carries a label, so the legend shows one "Comfort band" entry.
# where="pre": each value is drawn over the interval ending at its timestamp.
plt.step(
    rl_df["Datetime"],
    comfort_low_series,
    where="pre",
    color="red",
    linestyle=":",
    linewidth=2,
    label="Comfort band",
)
plt.step(
    rl_df["Datetime"],
    comfort_high_series,
    where="pre",
    color="red",
    linestyle=":",
    linewidth=2,
)

# Highlight peak price hours (17:00–21:00) as a grey region
# The fractional hour is derived from the logged timestamps, not from the "Hour" column.
_dt_series = pd.to_datetime(rl_df["Datetime"])
_hours_peak = _dt_series.dt.hour + _dt_series.dt.minute / 60.0
_peak_mask = (_hours_peak >= 17.0) & (_hours_peak < 21.0)
_dt_values_peak = _dt_series.to_numpy()
_peak_values = _peak_mask.to_numpy()

# Scan state: whether a peak run is open, where it started, and whether the
# legend label has already been attached to a span.
_in_peak = False
_peak_start = None
_added_peak_legend = False

for _dt, _peak in zip(_dt_values_peak, _peak_values):
    if _peak and not _in_peak:
        # First peak sample of a run opens a span.
        _in_peak = True
        _peak_start = _dt
    elif not _peak and _in_peak:
        # End the peak interval at the start of the first non-peak time
        if not _added_peak_legend:
            # Only the first span is labelled so the legend has a single entry.
            plt.axvspan(_peak_start, _dt, color="grey", alpha=0.12, zorder=0, label="Peak price period")
            _added_peak_legend = True
        else:
            plt.axvspan(_peak_start, _dt, color="grey", alpha=0.12, zorder=0)
        _in_peak = False

# If the last timestep is still in peak hours, shade until the final timestamp
if _in_peak and _peak_start is not None:
    if not _added_peak_legend:
        plt.axvspan(_peak_start, _dt_values_peak[-1], color="grey", alpha=0.12, zorder=0, label="Peak price period")
    else:
        plt.axvspan(_peak_start, _dt_values_peak[-1], color="grey", alpha=0.12, zorder=0)

# Axes cosmetics: fixed 16–28 °C range with 1 °C ticks, x-range clipped to the data.
plt.xlabel("Datetime", fontsize=16)
plt.ylabel("Temperature (°C)", fontsize=16)
plt.ylim(16, 28)
plt.yticks(np.arange(16, 29, 1), fontsize=14)
plt.xlim(rl_df["Datetime"].min(), rl_df["Datetime"].max())
plt.legend(loc='upper right', ncol=3, fontsize=12)
plt.grid(True, linestyle="--", alpha=0.5)
plt.xticks(rotation=30, fontsize=14)
plt.tight_layout()
plt.show()

# ===========================
# Figure 2: Outdoor Temperature
# ===========================
# Outdoor temperature logged during the RL run, on its own figure.
plt.figure(figsize=(12, 6))
plt.plot(rl_df["Datetime"], rl_df["OutdoorTemp_RL"], color="blue", linewidth=2, label="Outdoor Temp (°C)")

plt.xlabel("Datetime", fontsize=16)
plt.ylabel("Temperature (°C)", fontsize=16)
plt.xlim(rl_df["Datetime"].min(), rl_df["Datetime"].max())
# Fixed y-range of -15..10 °C with ticks every 5 °C; values outside it are clipped from view.
plt.ylim(-15, 10)
plt.yticks(np.arange(-15, 11, 5))
plt.grid(True, linestyle="--", alpha=0.5)
plt.xticks(rotation=30, fontsize=14)
# Second yticks call passes no positions; it only restyles the tick labels set above.
plt.yticks(fontsize=14)
plt.legend(fontsize=14)
plt.tight_layout()
plt.show()

# ===========================
# Figure 3: Combined Indoor & Outdoor Temperature
# ===========================
# Same content as Figure 1 plus the outdoor temperature, on a wider y-range.
plt.figure(figsize=(12, 6))

# RL and Baseline setpoints and indoor temps
plt.plot(rl_df["Datetime"], rl_df["Setpoint_RL"], color="black", linestyle="-", linewidth=2, label="RL Setpoint")
plt.plot(rl_df["Datetime"], rl_df["IndoorTemp_RL"], color="magenta", linestyle="-", alpha=0.9, linewidth=1.5, label="RL Indoor Temp")

plt.plot(base_df["Datetime"], base_df["Setpoint_RL"], color="black", linestyle="--", linewidth=2, label="Baseline Setpoint (21°C)")
plt.plot(base_df["Datetime"], base_df["IndoorTemp_RL"], color="magenta", linestyle="--", alpha=0.9, linewidth=1.5, label="Baseline Indoor Temp")

# Comfort bands (only the lower limit is labelled, giving one legend entry)
plt.step(
    rl_df["Datetime"],
    comfort_low_series,
    where="pre",
    color="red",
    linestyle=":",
    linewidth=2,
    label="Comfort band",
)
plt.step(
    rl_df["Datetime"],
    comfort_high_series,
    where="pre",
    color="red",
    linestyle=":",
    linewidth=2,
)

# Outdoor temperature
plt.plot(rl_df["Datetime"], rl_df["OutdoorTemp_RL"], color="blue", linewidth=2, label="Outdoor Temp (°C)")

# Grey peak price region (same scan as in Figure 1, recomputed here so this
# section does not depend on variables left over from the earlier figure)
_dt_series = pd.to_datetime(rl_df["Datetime"])
_hours_peak = _dt_series.dt.hour + _dt_series.dt.minute / 60.0
_peak_mask = (_hours_peak >= 17.0) & (_hours_peak < 21.0)
_dt_values_peak = _dt_series.to_numpy()
_peak_values = _peak_mask.to_numpy()

_in_peak = False
_peak_start = None
_added_peak_legend = False

for _dt, _peak in zip(_dt_values_peak, _peak_values):
    if _peak and not _in_peak:
        # First peak sample of a run opens a span.
        _in_peak = True
        _peak_start = _dt
    elif not _peak and _in_peak:
        # First non-peak sample closes the span; only the first span is labelled.
        if not _added_peak_legend:
            plt.axvspan(_peak_start, _dt, color="grey", alpha=0.12, zorder=0, label="Peak price period")
            _added_peak_legend = True
        else:
            plt.axvspan(_peak_start, _dt, color="grey", alpha=0.12, zorder=0)
        _in_peak = False

# A run still open at the end of the data is shaded up to the last timestamp.
if _in_peak and _peak_start is not None:
    if not _added_peak_legend:
        plt.axvspan(_peak_start, _dt_values_peak[-1], color="grey", alpha=0.12, zorder=0, label="Peak price period")
    else:
        plt.axvspan(_peak_start, _dt_values_peak[-1], color="grey", alpha=0.12, zorder=0)

# y-range -15..35 °C so indoor and outdoor curves share one axis.
plt.xlabel("Datetime", fontsize=16)
plt.ylabel("Temperature (°C)", fontsize=16)
plt.xlim(rl_df["Datetime"].min(), rl_df["Datetime"].max())
plt.ylim(-15, 35)
plt.yticks(np.arange(-15, 36, 5), fontsize=14)
plt.grid(True, linestyle="--", alpha=0.5)
plt.xticks(rotation=30, fontsize=14)
plt.legend(loc='upper right', ncol=4, fontsize=11)
plt.tight_layout()
plt.show()


In [None]:
# -----------------------------------------------------------------------------
# Display Table: Datetime, OCHRE Internal Clock (Hour), Comfort Band Low/High
# plus RL & Baseline setpoints and indoor temps, a single Outdoor Temp, and Price
# -----------------------------------------------------------------------------
# Verification aid: puts the hour observation next to the comfort limits derived
# from it, so the band switching can be checked row by row.
# Datetime, hour, outdoor temperature and price are taken from the RL run only.
# NOTE(review): this assumes rl_df and base_df have the same number of rows — confirm.

comfort_table = pd.DataFrame({
    "Datetime": rl_df["Datetime"],
    "OCHRE_Hour_Obs": rl_df["Hour"],
    "Comfort_Low": comfort_low_series,
    "Comfort_High": comfort_high_series,
    "Outdoor_Temp": rl_df["OutdoorTemp_RL"],
    "Energy_Price": rl_df["Price"],
    "RL_Setpoint": rl_df["Setpoint_RL"],
    "RL_Indoor_Temp": rl_df["IndoorTemp_RL"],
    "Baseline_Setpoint": base_df["Setpoint_RL"],
    "Baseline_Indoor_Temp": base_df["IndoorTemp_RL"],
})

# Show only the first 50 rows to keep the cell output short.
print("=== Comfort Band + RL/Baseline Verification Table (First 50 Steps) ===")
display(comfort_table.head(50))


In [None]:
# ===========================
# Helpers
# ===========================
def is_occupied(hour_of_day: float) -> bool:
    """Return True when the hour, wrapped modulo 24, lies in [18, 24) or [0, 9).

    Kept in this cell so the environment wrapper can call it (it was restored
    to fix a TypeError there).
    """
    wrapped_hour = float(hour_of_day) % 24.0
    if wrapped_hour >= 18.0:
        return True
    return wrapped_hour < 9.0

def safe_datetime(info, t, start="2018-02-01", minutes=30):
    """Return the timestamp for step ``t``.

    Uses ``info["Datetime"]`` when ``info`` is a dict that provides a non-None
    value (strings are parsed with ``pd.to_datetime``, other objects are
    returned unchanged). Otherwise the timestamp is synthesized as
    ``start + t * minutes``.
    """
    stamp = info.get("Datetime", None) if isinstance(info, dict) else None
    if stamp is not None:
        return pd.to_datetime(stamp) if isinstance(stamp, str) else stamp
    # No timestamp supplied by the env: fall back to a fixed-step clock.
    return pd.Timestamp(start) + pd.Timedelta(minutes=minutes * t)

def extract_indoor_temp(obs, info):
    """Indoor temperature as a float, taken from ``info`` when available.

    ``info["Temperature - Indoor (C)"]`` wins if ``info`` is a dict with that
    key; otherwise index 2 of the flattened observation is used (sorted base
    observation keys), or ``np.nan`` when fewer than three values are present.
    """
    if not isinstance(info, dict) or "Temperature - Indoor (C)" not in info:
        # No value in info: read the vectorized observation positionally.
        values = np.asarray(obs).ravel()
        return np.nan if values.size < 3 else float(values[2])
    return float(info["Temperature - Indoor (C)"])

def extract_outdoor_temp(obs, info):
    """Outdoor temperature as a float, taken from ``info`` when available.

    ``info["Temperature - Outdoor (C)"]`` wins if ``info`` is a dict with that
    key; otherwise index 3 of the flattened observation is used (sorted base
    observation keys), or ``np.nan`` when fewer than four values are present.
    """
    if not isinstance(info, dict) or "Temperature - Outdoor (C)" not in info:
        # No value in info: read the vectorized observation positionally.
        values = np.asarray(obs).ravel()
        return np.nan if values.size < 4 else float(values[3])
    return float(info["Temperature - Outdoor (C)"])

def extract_obs_components(obs):
    """Return ``(tin, tout, price, hour)`` from the flattened observation.

    Expected positions: 0 = current price, 1 = current hour,
    2 = current indoor temperature, 3 = current outdoor temperature.
    Positions beyond the end of the observation come back as ``np.nan``.
    """
    values = np.asarray(obs).ravel()
    count = values.size
    # Read the first four slots in storage order, padding with NaN.
    leading = [float(values[pos]) if pos < count else np.nan for pos in range(4)]
    price, hour, tin, tout = leading
    return tin, tout, price, hour

def compute_comfort_bands_from_hour(hours):
    """Return ``(lows, highs)`` comfort limits for the given hour-of-day values.

    Uses the same hour test as ``is_occupied``: after wrapping modulo 24, hours
    in [18, 24) or [0, 9) get the 19.0–23.0 band, all others 17.0–26.0.
    Both results are NumPy arrays shaped like ``hours``.
    """
    hour_of_day = np.mod(np.asarray(hours, dtype=float), 24.0)
    narrow_band = np.logical_or(hour_of_day >= 18.0, hour_of_day < 9.0)
    return (
        np.where(narrow_band, 19.0, 17.0),
        np.where(narrow_band, 23.0, 26.0),
    )


# ===========================
# Baseline Controller
# ===========================
class RealisticThermostat:
    """Fixed-setpoint baseline with the ``predict`` call shape of a trained model."""

    def __init__(self, setpoint=21):
        # Length-1 float32 array; predict() hands this same object back every call.
        self.setpoint = np.array((setpoint,), dtype=np.float32)

    def predict(self, obs, deterministic=True):
        """Return ``(setpoint_array, None)`` regardless of the observation."""
        return (self.setpoint, None)

# ===========================
# Evaluation Function (1-day evaluation)
# ===========================
def evaluate_policy_with_actions_1day(env, model, max_steps=2000):
    """Roll out ``model`` on ``env`` for one episode and log the trajectory.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns ``(obs, reward, terminated, truncated, info)``.
        model: Object with ``predict(obs, deterministic=True)`` returning
            ``(action, state)``; only the first action element is logged.
        max_steps: Upper bound on the number of environment steps.

    Returns:
        pandas.DataFrame with one row for the initial (pre-step) state followed
        by one row per executed step, including the step on which the episode
        terminated or was truncated. Columns: "Datetime", "Setpoint_RL",
        "IndoorTemp_RL", "OutdoorTemp_RL", "Tout", "Price", "Hour".
        Rows 1.. pair the action of a step with the state observed after it.

    Raises:
        RuntimeError: if the model returns an empty action.
    """
    obs, info = env.reset()

    # Row 0: state returned by reset(), before any action has been applied.
    # NOTE(review): safe_datetime's fallback clock starts at its own default
    # date; if info carries no "Datetime", confirm that matches this env's start.
    _, tout0, price0, hour0 = extract_obs_components(obs)
    data = [{
        "Datetime": safe_datetime(info, 0),
        "Setpoint_RL": np.nan,  # no action applied yet
        "IndoorTemp_RL": extract_indoor_temp(obs, info),
        "OutdoorTemp_RL": extract_outdoor_temp(obs, info),
        "Tout": tout0,
        "Price": price0,
        "Hour": hour0,
    }]

    for t in range(max_steps):
        action, _ = model.predict(obs, deterministic=True)
        action = np.asarray(action, dtype=np.float32).ravel()
        if action.size == 0:
            raise RuntimeError("Empty action from model.")
        sp = float(action[0])

        obs, _, terminated, truncated, info = env.step(action)
        _, tout, price, hour = extract_obs_components(obs)

        data.append({
            # use t+1 so fallback time is 00:30, 01:00, ... after the 00:00 row
            "Datetime": safe_datetime(info, t + 1),
            "Setpoint_RL": sp,
            "IndoorTemp_RL": extract_indoor_temp(obs, info),
            "OutdoorTemp_RL": extract_outdoor_temp(obs, info),
            "Tout": tout,
            "Price": price,
            "Hour": hour,
        })

        if terminated or truncated:
            break

    # Build the frame once, after the loop. Doing it inside the loop (after the
    # break check) dropped the terminal step and left `df` unbound whenever the
    # episode ended on its very first step.
    df = pd.DataFrame(data)

    # Back-fill the initial row's setpoint with the first applied action so the
    # setpoint series has no leading NaN.
    if len(df) >= 2 and np.isnan(df.loc[0, "Setpoint_RL"]):
        df.loc[0, "Setpoint_RL"] = df.loc[1, "Setpoint_RL"]

    return df


# ===========================
# Evaluation Env (UPDATED CONFIG)
# ===========================
def make_eval_env(
    start_time: str = "2018-02-03 00:00:00",
    episode_duration: str = "1 days",
):
    """Create an evaluation env for a given start time and episode duration.

    Defaults give a 1-day evaluation window starting 2018-02-03 00:00 (the
    previous docstring described a 7-day February window, which does not match
    these defaults). The notebook defines a function with this name more than
    once; whichever definition ran last is the one in effect.

    The env from ``make_env(start_time, episode_duration)`` is wrapped in
    ``VariableComfortRewardWrapper`` and then in ``ForecastObsWrapper``.
    """
    base = make_env(start_time, episode_duration)
    base = VariableComfortRewardWrapper(
        base,
        comfort_unit_penalty=50.0,
        reward_scale=0.1,
        energy_cost_multiplier=15.0,
        preferred_temp=21.0,
        preferred_temp_weight=15.0,
        peak_hours=((17.0, 21.0),),  # <-- ensure eval uses same peak window
    )
    base = ForecastObsWrapper(
        base,
        start_time=start_time,
        episode_duration=episode_duration,
        horizon_hours=12.0,
        time_res_minutes=30,
    )
    return base

# Build the evaluation env with the defaults of the make_eval_env defined in
# this cell: a 1-day window starting "2018-02-03 00:00:00" (not the 7-day window).
env_feb = make_eval_env()

# ===========================
# Run Simulations
# ===========================
# Both controllers are rolled out on the same env instance, one after the other.
# These assignments overwrite any rl_df / base_df / comfort series already in the kernel.
print("Running RL Agent...")
# NOTE(review): evaluate_policy_with_actions_1day() calls env.reset() itself, so
# this explicit reset looks redundant — confirm before removing.
env_feb.reset()
rl_df = evaluate_policy_with_actions_1day(env_feb, sac_model)

print("Running Baseline...")
env_feb.reset()
# Constant 21 °C setpoint used as the reference controller.
baseline_model = RealisticThermostat(setpoint=21)
base_df = evaluate_policy_with_actions_1day(env_feb, baseline_model)

# Compute time-varying comfort bands from the logged "Hour" observation of the
# RL run (not from the Datetime column).
comfort_low_series, comfort_high_series = compute_comfort_bands_from_hour(
    rl_df["Hour"]
)

# ===========================
# Figure 1: Indoor Temperature Comparison
# ===========================
# 1-day version: same curves as the multi-day figure, with an hourly x-axis.
# Every call below goes through the pyplot state machine and therefore targets
# the figure created on the next line.
plt.figure(figsize=(12, 6))

# RL Plots
plt.plot(rl_df["Datetime"], rl_df["Setpoint_RL"], color="black", linestyle="-", linewidth=2, label="RL Setpoint")
plt.plot(rl_df["Datetime"], rl_df["IndoorTemp_RL"], color="magenta", linestyle="-", alpha=0.9, linewidth=1.5, label="RL Indoor Temp")

# Baseline Plots (same colours as RL, dashed)
plt.plot(base_df["Datetime"], base_df["Setpoint_RL"], color="black", linestyle="--", linewidth=2, label="Baseline Setpoint (21°C)")
plt.plot(base_df["Datetime"], base_df["IndoorTemp_RL"], color="magenta", linestyle="--", alpha=0.9, linewidth=1.5, label="Baseline Indoor Temp")

# Comfort Bands (STEP PLOTS)
# Only the lower limit carries a label, so the legend shows one "Comfort band" entry.
# where="pre": each value is drawn over the interval ending at its timestamp.
plt.step(
    rl_df["Datetime"],
    comfort_low_series,
    where="pre",
    color="red",
    linestyle=":",
    linewidth=2,
    label="Comfort band",
)
plt.step(
    rl_df["Datetime"],
    comfort_high_series,
    where="pre",
    color="red",
    linestyle=":",
    linewidth=2,
)

# Highlight peak price hours (17:00–21:00) as a grey region
# The fractional hour is derived from the logged timestamps, not from the "Hour" column.
_dt_series = pd.to_datetime(rl_df["Datetime"])
_hours_peak = _dt_series.dt.hour + _dt_series.dt.minute / 60.0
_peak_mask = (_hours_peak >= 17.0) & (_hours_peak < 21.0)
_dt_values_peak = _dt_series.to_numpy()
_peak_values = _peak_mask.to_numpy()

# Scan state: whether a peak run is open, where it started, and whether the
# legend label has already been attached to a span.
_in_peak = False
_peak_start = None
_added_peak_legend = False

for _dt, _peak in zip(_dt_values_peak, _peak_values):
    if _peak and not _in_peak:
        # First peak sample of a run opens a span.
        _in_peak = True
        _peak_start = _dt
    elif not _peak and _in_peak:
        # End the peak interval at the start of the first non-peak time
        if not _added_peak_legend:
            # Only the first span is labelled so the legend has a single entry.
            plt.axvspan(_peak_start, _dt, color="grey", alpha=0.12, zorder=0, label="Peak price period")
            _added_peak_legend = True
        else:
            plt.axvspan(_peak_start, _dt, color="grey", alpha=0.12, zorder=0)
        _in_peak = False

# If the last timestep is still in peak hours, shade until the final timestamp
if _in_peak and _peak_start is not None:
    if not _added_peak_legend:
        plt.axvspan(_peak_start, _dt_values_peak[-1], color="grey", alpha=0.12, zorder=0, label="Peak price period")
    else:
        plt.axvspan(_peak_start, _dt_values_peak[-1], color="grey", alpha=0.12, zorder=0)

plt.xlabel("Hour of Day", fontsize=16)
plt.ylabel("Temperature (°C)", fontsize=16)
plt.ylim(16, 28)
plt.yticks(np.arange(16, 29, 1), fontsize=14)
plt.xlim(rl_df["Datetime"].min(), rl_df["Datetime"].max())
# One major tick per hour, labelled with the two-digit hour ("%H").
ax1 = plt.gca()
ax1.xaxis.set_major_locator(mdates.HourLocator(interval=1))
ax1.xaxis.set_major_formatter(mdates.DateFormatter("%H"))
plt.legend(loc='upper right', ncol=3, fontsize=12)
plt.grid(True, linestyle="--", alpha=0.5)
plt.xticks(fontsize=14)
plt.tight_layout()
plt.show()

# ===========================
# Figure 2: Outdoor Temperature
# ===========================
# Outdoor temperature logged during the RL run, on its own figure (1-day view).
plt.figure(figsize=(12, 6))
plt.plot(rl_df["Datetime"], rl_df["OutdoorTemp_RL"], color="blue", linewidth=2, label="Outdoor Temp (°C)")

plt.xlabel("Hour of Day", fontsize=16)
plt.ylabel("Temperature (°C)", fontsize=16)
plt.xlim(rl_df["Datetime"].min(), rl_df["Datetime"].max())
# Fixed y-range of -15..10 °C with ticks every 5 °C; values outside it are clipped from view.
plt.ylim(-15, 10)
plt.yticks(np.arange(-15, 11, 5))
# One major tick per hour, labelled with the two-digit hour ("%H").
ax2 = plt.gca()
ax2.xaxis.set_major_locator(mdates.HourLocator(interval=1))
ax2.xaxis.set_major_formatter(mdates.DateFormatter("%H"))
plt.grid(True, linestyle="--", alpha=0.5)
plt.xticks(fontsize=14)
# This yticks call passes no positions; it only restyles the tick labels set above.
plt.yticks(fontsize=14)
plt.legend(fontsize=14)
plt.tight_layout()
plt.show()

# ===========================
# Figure 3: Combined Indoor & Outdoor Temperature
# ===========================
# Same content as Figure 1 plus the outdoor temperature, on a wider y-range (1-day view).
plt.figure(figsize=(12, 6))

# RL and Baseline setpoints and indoor temps
plt.plot(rl_df["Datetime"], rl_df["Setpoint_RL"], color="black", linestyle="-", linewidth=2, label="RL Setpoint")
plt.plot(rl_df["Datetime"], rl_df["IndoorTemp_RL"], color="magenta", linestyle="-", alpha=0.9, linewidth=1.5, label="RL Indoor Temp")

plt.plot(base_df["Datetime"], base_df["Setpoint_RL"], color="black", linestyle="--", linewidth=2, label="Baseline Setpoint (21°C)")
plt.plot(base_df["Datetime"], base_df["IndoorTemp_RL"], color="magenta", linestyle="--", alpha=0.9, linewidth=1.5, label="Baseline Indoor Temp")

# Comfort bands (only the lower limit is labelled, giving one legend entry)
plt.step(
    rl_df["Datetime"],
    comfort_low_series,
    where="pre",
    color="red",
    linestyle=":",
    linewidth=2,
    label="Comfort band",
)
plt.step(
    rl_df["Datetime"],
    comfort_high_series,
    where="pre",
    color="red",
    linestyle=":",
    linewidth=2,
)

# Outdoor temperature
plt.plot(rl_df["Datetime"], rl_df["OutdoorTemp_RL"], color="blue", linewidth=2, label="Outdoor Temp (°C)")

# Grey peak price region (same scan as in Figure 1, recomputed here so this
# section does not depend on variables left over from the earlier figure)
_dt_series = pd.to_datetime(rl_df["Datetime"])
_hours_peak = _dt_series.dt.hour + _dt_series.dt.minute / 60.0
_peak_mask = (_hours_peak >= 17.0) & (_hours_peak < 21.0)
_dt_values_peak = _dt_series.to_numpy()
_peak_values = _peak_mask.to_numpy()

_in_peak = False
_peak_start = None
_added_peak_legend = False

for _dt, _peak in zip(_dt_values_peak, _peak_values):
    if _peak and not _in_peak:
        # First peak sample of a run opens a span.
        _in_peak = True
        _peak_start = _dt
    elif not _peak and _in_peak:
        # First non-peak sample closes the span; only the first span is labelled.
        if not _added_peak_legend:
            plt.axvspan(_peak_start, _dt, color="grey", alpha=0.12, zorder=0, label="Peak price period")
            _added_peak_legend = True
        else:
            plt.axvspan(_peak_start, _dt, color="grey", alpha=0.12, zorder=0)
        _in_peak = False

# A run still open at the end of the data is shaded up to the last timestamp.
if _in_peak and _peak_start is not None:
    if not _added_peak_legend:
        plt.axvspan(_peak_start, _dt_values_peak[-1], color="grey", alpha=0.12, zorder=0, label="Peak price period")
    else:
        plt.axvspan(_peak_start, _dt_values_peak[-1], color="grey", alpha=0.12, zorder=0)

# y-range -15..35 °C so indoor and outdoor curves share one axis.
plt.xlabel("Hour of Day", fontsize=16)
plt.ylabel("Temperature (°C)", fontsize=16)
plt.xlim(rl_df["Datetime"].min(), rl_df["Datetime"].max())
plt.ylim(-15, 35)
plt.yticks(np.arange(-15, 36, 5), fontsize=14)
# One major tick per hour, labelled with the two-digit hour ("%H").
ax3 = plt.gca()
ax3.xaxis.set_major_locator(mdates.HourLocator(interval=1))
ax3.xaxis.set_major_formatter(mdates.DateFormatter("%H"))
plt.grid(True, linestyle="--", alpha=0.5)
plt.xticks(fontsize=14)
plt.legend(loc='upper right', ncol=4, fontsize=11)
plt.tight_layout()
plt.show()


In [None]:
# Inspect the action space of the env returned by make_env (1-day episode).
# make_env is called directly here, so the reward/forecast wrappers used by
# make_eval_env are not applied.
env = make_env("2018-02-01 00:00:00", "1 days")

# try/finally guarantees the env is closed even if one of the prints raises.
try:
    # Print the full action space object
    print("Action Space:", env.action_space)

    # Print the specific minimum and maximum values
    print("Minimum Allowed Setpoint:", env.action_space.low)
    print("Maximum Allowed Setpoint:", env.action_space.high)
finally:
    env.close()