In [1]:
import numpy as np
import pandas as pd

from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor

from envs.bess_env import BatteryEnv
from utils.csv_handler import load_price_data, save_records
from utils.eval_handler import evaluate_rollout
from utils.forecast_scenario import ForecastScenarioGenerator

In [2]:
# --------------------------------------------------
# Experiment config
# --------------------------------------------------
training_steps_list = [10_000, 20_000, 40_000, 60_000, 80_000, 100_000]
n_runs = 5
base_seed = 10

forecast_horizon_hours = 3.0
dt_hours = 0.25
H = int(round(forecast_horizon_hours / dt_hours))

# --------------------------------------------------
# Load data
# --------------------------------------------------
dfp_train, price_train, ts_train = load_price_data(
    csv_path="../../../data/electricity_price/dayahead_2024_11.csv",
    resolution="15min",
)

dfp_eval, price_eval, ts_eval = load_price_data(
    csv_path="../../../data/electricity_price/dayahead_2025_11.csv",
    resolution="15min",
    time_range=("2025-11-01", "2025-11-07"),
)

# --------------------------------------------------
# Forecast scenarios (fixed across runs)
# --------------------------------------------------
price_scenario_gen = ForecastScenarioGenerator(
    horizon_steps=H,
    sigma0=0.01,
    sigmaH=0.06,
    schedule="sqrt",
    base_seed=1234,
)

# --------------------------------------------------
# Storage for results
# --------------------------------------------------
records = []

In [3]:
# ==================================================
# Main experiment loop
# ==================================================
for total_steps in training_steps_list:
    print(f"\n=== Training steps: {total_steps} ===")

    run_rewards = []

    for run_id in range(n_runs):
        seed = base_seed + run_id
        print(f"  Run {run_id+1}/{n_runs} (seed={seed})")

        # -----------------------------
        # Training env
        # -----------------------------
        train_env_raw = BatteryEnv(
            price_series=price_train,
            timestamps=ts_train,
            dt_hours=dt_hours,
            capacity_kWh=50.0,
            p_max_kW=10.0,
            use_discrete_actions=True,
            use_price_forecast=True,
            forecast_horizon_hours=forecast_horizon_hours,
            episode_days=7.0,
            random_start=True,
            random_seed=seed,
            price_scenario_gen=price_scenario_gen,
        )

        train_env = Monitor(train_env_raw)

        model = DQN(
            "MlpPolicy",
            train_env,
            learning_rate=1e-3,
            buffer_size=50_000,
            learning_starts=1_000,
            batch_size=64,
            gamma=0.99,
            train_freq=4,
            target_update_interval=1_000,
            exploration_initial_eps=1.0,
            exploration_final_eps=0.05,
            exploration_fraction=0.3,
            verbose=0,
            seed=seed,
        )

        model.learn(total_timesteps=total_steps)

        # -----------------------------
        # Evaluation env (fixed)
        # -----------------------------
        eval_env = BatteryEnv(
            price_series=price_eval,
            timestamps=ts_eval,
            dt_hours=dt_hours,
            capacity_kWh=50.0,
            p_max_kW=10.0,
            use_discrete_actions=True,
            use_price_forecast=True,
            forecast_horizon_hours=forecast_horizon_hours,
            episode_days=7.0,
            random_start=False,
            random_seed=999,
            price_scenario_gen=price_scenario_gen,
            scenario_id=0,
            vary_scenario_per_episode=False,
        )

        rollout = evaluate_rollout(model=model, env=eval_env)

        total_reward = np.sum(rollout["reward"])
        run_rewards.append(total_reward)

        print(f"    → Run reward = {total_reward:.2f}")

        records.append({
            "agent": "DQN",
            "training_steps": total_steps,
            "run_id": run_id,
            "seed": seed,
            "total_reward": total_reward,
        })

    # -----------------------------
    # Summary for this step size
    # -----------------------------
    mean_r = np.mean(run_rewards)
    std_r = np.std(run_rewards)

    print(f"  → mean reward = {mean_r:.2f} ± {std_r:.2f}")

# --------------------------------------------------
# Convert to DataFrame
# --------------------------------------------------
df_results = pd.DataFrame(records)

summary = (
    df_results
    .groupby("training_steps")["total_reward"]
    .agg(["mean", "std"])
    .reset_index()
)

print("\n=== Summary ===")
print(summary)


=== Training steps: 10000 ===
  Run 1/5 (seed=10)
Episode finished after 672 steps
    → Run reward = 0.73
  Run 2/5 (seed=11)
Episode finished after 672 steps
    → Run reward = 0.41
  Run 3/5 (seed=12)
Episode finished after 672 steps
    → Run reward = 0.65
  Run 4/5 (seed=13)
Episode finished after 672 steps
    → Run reward = 0.76
  Run 5/5 (seed=14)
Episode finished after 672 steps
    → Run reward = 0.50
  → mean reward = 0.61 ± 0.13

=== Training steps: 20000 ===
  Run 1/5 (seed=10)
Episode finished after 672 steps
    → Run reward = -1.88
  Run 2/5 (seed=11)
Episode finished after 672 steps
    → Run reward = 2.04
  Run 3/5 (seed=12)
Episode finished after 672 steps
    → Run reward = 3.78
  Run 4/5 (seed=13)
Episode finished after 672 steps
    → Run reward = 3.81
  Run 5/5 (seed=14)
Episode finished after 672 steps
    → Run reward = 1.04
  → mean reward = 1.76 ± 2.10

=== Training steps: 40000 ===
  Run 1/5 (seed=10)
Episode finished after 672 steps
    → Run reward = -1.3

In [4]:
save_records(
    records=records,
    out_path="results/learning_steps_records.csv",
    experiment_id="dqn_learning_steps_v1",
)

[save_experiment_records] Saved 30 new rows (total=90) to results/learning_steps_records.csv
