In [1]:
import sys, os
# Resolve the parent of the current working directory and make it importable
# (presumably so the `gym_homer` import further down in this cell resolves).
parent_dir = os.getcwd()
path = os.path.dirname(parent_dir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if path not in sys.path:
    sys.path.append(path)

from gym_homer.envs.test_env_v0 import HomerEnv
import pandas as pd
import numpy as np

import torch
import gym
from gym import spaces, wrappers
import envpool

from tianshou.utils import WandbLogger
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, VectorReplayBuffer, AsyncCollector
from tianshou.env import DummyVectorEnv, SubprocVectorEnv, ShmemVectorEnv
from tianshou.policy import PPOPolicy
from tianshou.trainer import onpolicy_trainer
from tianshou.utils.net.common import ActorCritic, Net
from tianshou.utils.net.discrete import Actor, Critic

import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Counter embedded in the W&B run name; the logger cell increments it after each run is created.
i = 0

In [2]:
# Show the directory that was added to sys.path (the parent of the working directory).
path

'/home/ds1/w210/home-energy-optimizer'

In [3]:
# Directory handed to the TensorBoard SummaryWriter below.
log_path = path + '/logs/'

# Every execution of this cell starts a W&B run whose name carries the current counter value.
run_name = f'debug_{i}'
logger = WandbLogger(project="RL_project", entity="w266_wra", name=run_name, config=None)

# Attach a TensorBoard writer (writing under log_path) to the W&B logger.
tb_writer = SummaryWriter(log_path)
logger.load(tb_writer)

# Advance the counter so the next execution of this cell gets a different run name.
i += 1

[34m[1mwandb[0m: Currently logged in as: [33matox120[0m ([33mw266_wra[0m). Use [1m`wandb login --relogin`[0m to force relogin




In [4]:
# Load Data
# Missing values are replaced with 0 so the environment never sees NaNs from the CSV.
data = pd.read_csv(path+"/test_env_data.csv", index_col=False).fillna(0)

# Collector-mode flag; it is assigned again right before the collectors are built.
vectorised = False

# Number of parallel worker environments used for training and for testing.
n_train_envs = 20
n_test_envs = 20


def make_env():
    """Build one HomerEnv over the shared `data`, created with start_soc='empty'."""
    return HomerEnv(data=data, start_soc='empty')


# Stand-alone environment; the later cells read its observation/action spaces.
env = make_env()
# One subprocess-backed environment per worker, all built by the same factory.
train_envs = SubprocVectorEnv([make_env for _ in range(n_train_envs)])
test_envs = SubprocVectorEnv([make_env for _ in range(n_test_envs)])

In [5]:
# Hyperparameters: hidden layer sizes for the network and the Adam learning rate.
hidden_sizes = [64,64]
lr_optimizer = 1e-4

# `net` is the network that both the actor and the critic are built on.
net = Net(env.observation_space.shape, hidden_sizes=hidden_sizes, device=device)

# Actor sized by the number of discrete actions (env.action_space.n).
actor = Actor(net, env.action_space.n, device=device)
actor = actor.to(device)

# Critic built on the same `net`.
critic = Critic(net, device=device)
critic = critic.to(device)

# Combine both modules so a single Adam optimizer covers the actor and the critic.
actor_critic = ActorCritic(actor, critic)
optim = torch.optim.Adam(actor_critic.parameters(), lr=lr_optimizer)

In [6]:
# The environment's action space is discrete, so a Categorical distribution is used for actions.
dist = torch.distributions.Categorical

# PPO policy over the actor/critic pair; deterministic_eval=True is requested for evaluation.
policy = PPOPolicy(actor, critic, optim, dist,
                   action_space=env.action_space,
                   deterministic_eval=True)

In [7]:
# Flag selecting the collector implementation: AsyncCollector when True, plain Collector otherwise.
vectorised = False

collector_cls = AsyncCollector if vectorised else Collector

# The training collector stores transitions in a vector replay buffer with one slot per training env.
train_collector = collector_cls(policy, train_envs, VectorReplayBuffer(20000, len(train_envs)))
# The test collector is created without an explicit buffer.
test_collector = collector_cls(policy, test_envs)

In [8]:
# Training schedule; each constant is forwarded to the trainer argument named alongside it.
n_max_epochs = 10            # max_epoch
n_steps = 5000               # step_per_epoch
n_steps_per_collect = 2000   # step_per_collect
rep_per_collector = 10       # repeat_per_collect
eps_per_test = 10            # episode_per_test
batch_size = 256             # batch_size
reward_stop = 4              # threshold used by the stop criterion below


def _reached_target(mean_reward):
    """Stop criterion: true once the mean reward is at least `reward_stop`."""
    return mean_reward >= reward_stop


# Run on-policy training; the returned summary dictionary is kept in `result`.
result = onpolicy_trainer(
    policy,
    train_collector,
    test_collector,
    max_epoch=n_max_epochs,
    step_per_epoch=n_steps,
    repeat_per_collect=rep_per_collector,
    episode_per_test=eps_per_test,
    batch_size=batch_size,
    step_per_collect=n_steps_per_collect,
    stop_fn=_reached_target,
    logger=logger,
)

Epoch #1: 6000it [00:00, 10738.95it/s, env_step=6000, len=11, loss=1.712, loss/clip=-0.006, loss/ent=1.094, loss/vf=3.457, n/ep=180, n/st=2000, rew=-2.71]


Epoch #1: test_reward: -2.175000 ± 0.000000, best_reward: -2.175000 ± 0.000000 in #1


Epoch #2: 6000it [00:00, 10598.16it/s, env_step=12000, len=11, loss=0.664, loss/clip=-0.003, loss/ent=1.023, loss/vf=1.354, n/ep=180, n/st=2000, rew=-2.31]


Epoch #2: test_reward: -2.175000 ± 0.000000, best_reward: -2.175000 ± 0.000000 in #1


Epoch #3: 6000it [00:00, 10618.38it/s, env_step=18000, len=11, loss=0.413, loss/clip=-0.005, loss/ent=0.916, loss/vf=0.853, n/ep=180, n/st=2000, rew=-2.06]


Epoch #3: test_reward: -2.175000 ± 0.000000, best_reward: -2.175000 ± 0.000000 in #1


Epoch #4: 6000it [00:00, 10539.19it/s, env_step=24000, len=11, loss=0.355, loss/clip=-0.008, loss/ent=0.955, loss/vf=0.745, n/ep=180, n/st=2000, rew=-1.75]


Epoch #4: test_reward: -2.175000 ± 0.000000, best_reward: -2.175000 ± 0.000000 in #1


Epoch #5: 6000it [00:00, 10441.72it/s, env_step=30000, len=11, loss=0.443, loss/clip=-0.010, loss/ent=0.930, loss/vf=0.924, n/ep=180, n/st=2000, rew=-1.16]


Epoch #5: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5


Epoch #6: 6000it [00:00, 10599.48it/s, env_step=36000, len=11, loss=0.629, loss/clip=-0.010, loss/ent=0.815, loss/vf=1.293, n/ep=180, n/st=2000, rew=-0.33]


Epoch #6: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5


Epoch #7: 6000it [00:00, 10261.46it/s, env_step=42000, len=11, loss=0.562, loss/clip=-0.012, loss/ent=0.630, loss/vf=1.161, n/ep=180, n/st=2000, rew=0.42]


Epoch #7: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5


Epoch #8: 6000it [00:00, 10528.94it/s, env_step=48000, len=11, loss=0.390, loss/clip=-0.009, loss/ent=0.441, loss/vf=0.807, n/ep=180, n/st=2000, rew=1.06]


Epoch #8: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5


Epoch #9: 6000it [00:00, 10420.02it/s, env_step=54000, len=11, loss=0.208, loss/clip=-0.005, loss/ent=0.319, loss/vf=0.433, n/ep=180, n/st=2000, rew=1.50]


Epoch #9: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5


Epoch #10: 6000it [00:00, 10532.52it/s, env_step=60000, len=11, loss=0.164, loss/clip=-0.004, loss/ent=0.249, loss/vf=0.341, n/ep=180, n/st=2000, rew=1.49]


Epoch #10: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5


In [9]:
# Print the summary dictionary returned by onpolicy_trainer (timings, step/episode counts, best reward).
print(result)

{'duration': '6.25s', 'train_time/model': '2.86s', 'test_step': 1210, 'test_episode': 110, 'test_time': '0.51s', 'test_speed': '2367.05 step/s', 'best_reward': 1.7499999906867743, 'best_result': '1.75 ± 0.00', 'train_step': 60000, 'train_episode': 5440, 'train_time/collector': '2.88s', 'train_speed': '10460.63 step/s'}


In [10]:
# Evaluate the trained policy: switch it to eval mode and collect 10 episodes without rendering.
# Note that `result` is rebound here from the training summary to the collection statistics.
policy.eval()
result = test_collector.collect(n_episode=10, render=False)

final_reward = result["rews"].mean()
final_length = result["lens"].mean()
print(f"Final reward: {final_reward}, length: {final_length}")

Final reward: 1.7499999906867743, length: 11.0


In [11]:
# Print the full statistics dictionary from the evaluation collect() call (per-episode rewards and lengths plus aggregates).
print(result)

{'n/ep': 10, 'n/st': 110, 'rews': array([1.74999999, 1.74999999, 1.74999999, 1.74999999, 1.74999999,
       1.74999999, 1.74999999, 1.74999999, 1.74999999, 1.74999999]), 'lens': array([11, 11, 11, 11, 11, 11, 11, 11, 11, 11]), 'idxs': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), 'rew': 1.7499999906867743, 'len': 11.0, 'rew_std': 0.0, 'len_std': 0.0}


In [12]:
## Explicitly finish the W&B run held by the logger.
## Not in docs - also not sure if required. TODO: confirm whether this call is needed.
logger.wandb_run.finish()

0,1
global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
test/env_step,▁▂▂▃▄▅▅▆▇▇█
test/length,▁▁▁▁▁▁▁▁▁▁▁
test/length_std,▁▁▁▁▁▁▁▁▁▁▁
test/reward,▁▅▅▅▅██████
test/reward_std,▁▁▁▁▁▁▁▁▁▁▁
train/episode,▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁
train/length,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/reward,▁▁▁▂▂▂▂▂▂▃▃▃▄▃▄▄▄▅▅▆▆▇▇▇█▇████
update/loss,█▁

0,1
global_step,60000.0
test/env_step,60000.0
test/length,11.0
test/length_std,0.0
test/reward,1.75
test/reward_std,0.0
train/episode,180.0
train/length,11.0
train/reward,1.48861
update/loss,0.16423
