In [1]:
import gym
import stable_baselines3 as sb3
from src.pv_env import PVEnv, PVEnvDiscrete
from src.reward import RewardDeltaPower
import os

# --- File locations (relative to the notebook's working directory) ----------
PV_PARAMS_PATH = os.path.join("parameters", "01_pvarray.json")
PVARRAY_CKP_PATH = os.path.join("data", "01_pvarray_iv.json")
WEATHER_TRAIN_PATH = os.path.join("data", "weather_sim.csv")
WEATHER_TEST_PATH = os.path.join("data", "weather_real.csv")
AGENT_CKP_PATH = os.path.join("models", "02_mppt_ac.tar")

# --- Training hyperparameters ------------------------------------------------
# NOTE(review): none of the values below is referenced by the cells visible in
# this notebook (the agents are built with literal arguments) — confirm whether
# they are still needed.
LEARNING_RATE = 1e-5
ENTROPY_BETA = 1e-3
GAMMA = 0.9
N_STEPS = 4
BATCH_SIZE = 32

# The photovoltaic (PV) environments are currently disabled; Gym's Pendulum task
# is used in their place further down. The original construction is kept here
# so it can be re-enabled.
# NOTE(review): the disabled `test_env` reads WEATHER_TRAIN_PATH rather than
# WEATHER_TEST_PATH — confirm which file is intended before re-enabling.
#
# env = PVEnv.from_file(
#     PV_PARAMS_PATH,
#     WEATHER_TRAIN_PATH,
#     pvarray_ckp_path=PVARRAY_CKP_PATH,
#     states=["v_norm", "i_norm", "deg"],
#     reward_fn=RewardDeltaPower(1, 0.9),
# )
# test_env = PVEnv.from_file(
#     PV_PARAMS_PATH,
#     WEATHER_TRAIN_PATH,
#     pvarray_ckp_path=PVARRAY_CKP_PATH,
#     states=["v_norm", "i_norm", "deg"],
#     reward_fn=RewardDeltaPower(1, 0.9),
# )

# Two separate Pendulum instances, created in this order: `env` is handed to
# the agents for training, `test_env` is used for evaluation and rendering.
env, test_env = (gym.make('Pendulum-v0') for _ in range(2))

# Disabled experiment: wrapping the environment in SB3's VecNormalize.
# env = sb3.common.vec_env.VecNormalize()
# vec_env = sb3.common.vec_env.dummy_vec_env.DummyVecEnv([lambda: env])
# norm_env = sb3.common.vec_env.VecNormalize(vec_env)

In [2]:
def test_agent(agent, test_env):
    obs = test_env.reset()
    for i in range(832):
        test_env.render()
        action, _ = agent.predict(obs, deterministic=True)
        obs, reward, done, info = test_env.step(action)
        if done:
            break
    test_env.close()
    # test_env.render_vs_true(po=True)

In [23]:
# Build one agent per algorithm, all with the default MLP policy, all on the
# CPU, and all sharing the single training environment `env`.

a2c_agent = sb3.A2C(
    policy="MlpPolicy",
    env=env,
    # The previous value here, `sb3.common.utils.get_schedule_fn()` with no
    # argument and no trailing comma, could not run. A constant rate is passed
    # instead (7e-4 is SB3's documented default for A2C); a callable schedule
    # of the remaining training progress may be supplied here as well.
    learning_rate=7e-4,
    device='cpu',
    ent_coef=0.0,
    n_steps=8,
    gae_lambda=0.9,
    vf_coef=0.4,
    use_sde=True,
    gamma=0.99,
    policy_kwargs=dict(log_std_init=-2, ortho_init=False),
    # normalize_advantage=True,
    # policy_kwargs={'net_arch':[dict(pi=[128, 128, 128], vf=[128, 128, 128])]}
)

# The remaining agents are constructed with exactly the same arguments, so the
# argument list is written once and reused.
baseline_kwargs = dict(
    policy="MlpPolicy",
    env=env,
    learning_rate=1e-4,
    device='cpu',
)
ddpg_agent = sb3.DDPG(**baseline_kwargs)
ppo_agent = sb3.PPO(**baseline_kwargs)
sac_agent = sb3.SAC(**baseline_kwargs)
td3_agent = sb3.TD3(**baseline_kwargs)


AssertionError: 

In [20]:
%%time
# Train A2C for 10k more steps; every 1000 steps the agent is evaluated on
# `test_env` over 10 episodes. With reset_num_timesteps=False the timestep
# counter is not reset, so re-running this cell continues the same count.
a2c_agent.learn(
    total_timesteps=10_000,
    reset_num_timesteps=False,
    eval_env=test_env,
    eval_freq=1000,
    n_eval_episodes=10,
)
# Render one deterministic rollout of the freshly trained agent.
test_agent(agent=a2c_agent, test_env=test_env)

Eval num_timesteps=1033000, episode_reward=-748.41 +/- 373.32
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1034000, episode_reward=-664.63 +/- 312.18
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1035000, episode_reward=-954.52 +/- 68.35
Episode length: 200.00 +/- 0.00
Eval num_timesteps=1036000, episode_reward=-916.48 +/- 196.36
Episode length: 200.00 +/- 0.00
Eval num_timesteps=1037000, episode_reward=-847.95 +/- 317.50
Episode length: 200.00 +/- 0.00
Eval num_timesteps=1038000, episode_reward=-646.45 +/- 296.36
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1039000, episode_reward=-548.70 +/- 270.06
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1040000, episode_reward=-203.87 +/- 109.94
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1041000, episode_reward=-268.34 +/- 182.07
Episode length: 200.00 +/- 0.00
Eval num_timesteps=1042000, episode_reward=-2

In [19]:
# Replay the current A2C policy on the test environment without further training.
test_agent(agent=a2c_agent, test_env=test_env)

In [14]:
# Display the A2C policy object; its repr (shown as this cell's output) lists
# the network modules.
a2c_agent.policy

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=3, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=3, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=1, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)

In [7]:
%%time
# Train DDPG for 10k steps, evaluating on `test_env` (10 episodes) every 100
# steps; reset_num_timesteps=False keeps the agent's timestep counter running
# across repeated executions of this cell.
ddpg_agent.learn(
    total_timesteps=10_000,
    reset_num_timesteps=False,
    eval_env=test_env,
    eval_freq=100,
    n_eval_episodes=10,
)
# Render one deterministic rollout of the trained agent.
test_agent(agent=ddpg_agent, test_env=test_env)

Eval num_timesteps=3300, episode_reward=-1295.90 +/- 57.43
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3400, episode_reward=-1309.15 +/- 109.91
Episode length: 200.00 +/- 0.00
Eval num_timesteps=3500, episode_reward=-1277.39 +/- 156.85
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3600, episode_reward=-1294.97 +/- 80.52
Episode length: 200.00 +/- 0.00
Eval num_timesteps=3700, episode_reward=-1349.82 +/- 162.04
Episode length: 200.00 +/- 0.00
Eval num_timesteps=3800, episode_reward=-1344.39 +/- 91.88
Episode length: 200.00 +/- 0.00
Eval num_timesteps=3900, episode_reward=-1288.44 +/- 75.14
Episode length: 200.00 +/- 0.00
Eval num_timesteps=4000, episode_reward=-1246.53 +/- 79.28
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=4100, episode_reward=-1268.14 +/- 197.05
Episode length: 200.00 +/- 0.00
Eval num_timesteps=4200, episode_reward=-1354.15 +/- 31.90
Episode length: 200.00 +/- 0.00
Eval num_timesteps=4

In [8]:
%%time
# Train PPO for 10k steps, evaluating on `test_env` (10 episodes) every 100
# steps; reset_num_timesteps=False keeps the agent's timestep counter running
# across repeated executions of this cell.
ppo_agent.learn(
    total_timesteps=10_000,
    reset_num_timesteps=False,
    eval_env=test_env,
    eval_freq=100,
    n_eval_episodes=10,
)
# Render one deterministic rollout of the trained agent.
test_agent(agent=ppo_agent, test_env=test_env)

Eval num_timesteps=100, episode_reward=-1220.50 +/- 343.19
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=200, episode_reward=-1170.57 +/- 358.56
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=300, episode_reward=-1213.08 +/- 253.34
Episode length: 200.00 +/- 0.00
Eval num_timesteps=400, episode_reward=-1121.74 +/- 442.49
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=500, episode_reward=-1177.12 +/- 348.72
Episode length: 200.00 +/- 0.00
Eval num_timesteps=600, episode_reward=-1098.58 +/- 368.17
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=700, episode_reward=-1420.80 +/- 299.05
Episode length: 200.00 +/- 0.00
Eval num_timesteps=800, episode_reward=-1278.13 +/- 419.34
Episode length: 200.00 +/- 0.00
Eval num_timesteps=900, episode_reward=-1315.69 +/- 255.85
Episode length: 200.00 +/- 0.00
Eval num_timesteps=1000, episode_reward=-1184.39 +/- 325.54
Episode length: 200.00 +/- 0.00
E

In [9]:
%%time
# Train SAC for 10k steps, evaluating on `test_env` (10 episodes) every 100
# steps; reset_num_timesteps=False keeps the agent's timestep counter running
# across repeated executions of this cell.
sac_agent.learn(
    total_timesteps=10_000,
    reset_num_timesteps=False,
    eval_env=test_env,
    eval_freq=100,
    n_eval_episodes=10,
)
# Render one deterministic rollout of the trained agent.
test_agent(agent=sac_agent, test_env=test_env)

Eval num_timesteps=100, episode_reward=-1285.18 +/- 279.17
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=200, episode_reward=-1125.35 +/- 122.73
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=300, episode_reward=-1287.15 +/- 163.55
Episode length: 200.00 +/- 0.00
Eval num_timesteps=400, episode_reward=-1429.65 +/- 125.88
Episode length: 200.00 +/- 0.00
Eval num_timesteps=500, episode_reward=-1441.35 +/- 160.52
Episode length: 200.00 +/- 0.00
Eval num_timesteps=600, episode_reward=-1616.27 +/- 108.92
Episode length: 200.00 +/- 0.00
Eval num_timesteps=700, episode_reward=-1748.70 +/- 69.85
Episode length: 200.00 +/- 0.00
Eval num_timesteps=800, episode_reward=-1773.50 +/- 103.19
Episode length: 200.00 +/- 0.00
Eval num_timesteps=900, episode_reward=-1785.86 +/- 93.70
Episode length: 200.00 +/- 0.00
Eval num_timesteps=1000, episode_reward=-1801.09 +/- 95.03
Episode length: 200.00 +/- 0.00
Eval num_timesteps=1100, episode_reward=-1793.29

In [10]:
%%time
# Train TD3 for 10k steps, evaluating on `test_env` (10 episodes) every 100
# steps; reset_num_timesteps=False keeps the agent's timestep counter running
# across repeated executions of this cell.
td3_agent.learn(
    total_timesteps=10_000,
    reset_num_timesteps=False,
    eval_env=test_env,
    eval_freq=100,
    n_eval_episodes=10,
)
# Render one deterministic rollout of the trained agent.
test_agent(agent=td3_agent, test_env=test_env)

Eval num_timesteps=100, episode_reward=-1239.03 +/- 153.52
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=200, episode_reward=-1277.18 +/- 266.23
Episode length: 200.00 +/- 0.00
Eval num_timesteps=300, episode_reward=-1848.85 +/- 74.67
Episode length: 200.00 +/- 0.00
Eval num_timesteps=400, episode_reward=-1797.95 +/- 91.26
Episode length: 200.00 +/- 0.00
Eval num_timesteps=500, episode_reward=-1451.33 +/- 131.93
Episode length: 200.00 +/- 0.00
Eval num_timesteps=600, episode_reward=-1474.77 +/- 155.30
Episode length: 200.00 +/- 0.00
Eval num_timesteps=700, episode_reward=-1432.77 +/- 124.87
Episode length: 200.00 +/- 0.00
Eval num_timesteps=800, episode_reward=-1461.72 +/- 158.48
Episode length: 200.00 +/- 0.00
Eval num_timesteps=900, episode_reward=-959.18 +/- 228.22
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1000, episode_reward=-1064.03 +/- 211.57
Episode length: 200.00 +/- 0.00
Eval num_timesteps=1100, episode_reward=-1156.45