In [15]:
import sys, os
# Put the parent of the current working directory on the import path
# (presumably so the project-local `gym_homer` import below resolves).
parent_dir = os.getcwd()  # NOTE: despite the name, this is the *current* working directory
path = os.path.dirname(parent_dir)  # its parent; also used later to locate the CSV file
# Only append when missing, so re-running this cell does not pile up duplicates.
if path not in sys.path:
    sys.path.append(path)

from gym_homer.envs.test_env_v0 import HomerEnv
import pandas as pd
import numpy as np

import torch
import gym
from gym import spaces, wrappers
import envpool


from tianshou.data import Collector, VectorReplayBuffer, AsyncCollector
from tianshou.env import DummyVectorEnv, SubprocVectorEnv, ShmemVectorEnv
from tianshou.policy import PPOPolicy
from tianshou.trainer import onpolicy_trainer
from tianshou.utils.net.common import ActorCritic, Net
from tianshou.utils.net.discrete import Actor, Critic

import warnings

# Suppress every warning raised from here on.
# NOTE(review): this blanket filter also hides deprecation warnings from gym/tianshou.
warnings.filterwarnings('ignore')

# Seed the NumPy and PyTorch global RNGs so weight initialisation and action
# sampling in this process are repeatable across "Restart & Run All".
# NOTE(review): randomness inside the environment worker processes is not
# seeded here -- confirm whether HomerEnv is stochastic.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [16]:
# Load Data
# Missing values in the CSV are replaced with 0 before the frame is handed
# to the environments.
data = pd.read_csv(os.path.join(path, "test_env_data.csv"), index_col=False).fillna(0)

# Selects the vector-env backend and the number of parallel environments:
#   True  -> ShmemVectorEnv with 4 train / 4 test environments
#   False -> SubprocVectorEnv with 8 train / 8 test environments
# The collector cell branches on the same flag name.
vectorised = False


def make_env():
    """Return a fresh HomerEnv built on the shared `data` frame with start_soc='empty'."""
    return HomerEnv(data=data, start_soc='empty')


if vectorised:
    vector_env_cls = ShmemVectorEnv
    n_train_envs = 4
    n_test_envs = 4
else:
    vector_env_cls = SubprocVectorEnv
    n_train_envs = 8
    n_test_envs = 8

# Stand-alone environment; later cells read its observation/action spaces.
env = make_env()
# Each worker receives the factory and builds its own environment instance.
train_envs = vector_env_cls([make_env for _ in range(n_train_envs)])
test_envs = vector_env_cls([make_env for _ in range(n_test_envs)])

In [17]:
# `net` is the feature extractor shared by the actor and the critic.
hidden_sizes = [64, 64]
lr_optimizer = 1e-4

# Read the space dimensions once from the stand-alone environment.
obs_shape = env.observation_space.shape
n_actions = env.action_space.n

net = Net(obs_shape, hidden_sizes=hidden_sizes, device=device)

# Both heads are built on top of the same `net` instance.
actor = Actor(net, n_actions, device=device).to(device)
critic = Critic(net, device=device).to(device)

# A single Adam optimizer updates the parameters of the combined actor-critic module.
actor_critic = ActorCritic(actor, critic)
optim = torch.optim.Adam(actor_critic.parameters(), lr=lr_optimizer)

In [18]:
# The environment's action space is discrete, so actions are drawn from a
# categorical distribution.
dist = torch.distributions.Categorical

# Keyword options forwarded to PPOPolicy.
ppo_options = dict(
    action_space=env.action_space,
    deterministic_eval=True,
)
policy = PPOPolicy(actor, critic, optim, dist, **ppo_options)

In [19]:
# `vectorised` is deliberately NOT redefined here: it is set once in the
# environment-setup cell, so the collector type can never disagree with the
# vector-env backend that was actually built.
buffer_size = 20000  # total replay-buffer capacity passed to VectorReplayBuffer

# AsyncCollector when `vectorised` is True, the plain Collector otherwise.
collector_cls = AsyncCollector if vectorised else Collector

# The training collector stores transitions in a buffer with one sub-buffer
# per training environment; the test collector is created without a buffer argument.
train_collector = collector_cls(policy, train_envs, VectorReplayBuffer(buffer_size, len(train_envs)))
test_collector = collector_cls(policy, test_envs)

In [20]:
# Training hyper-parameters; each one maps onto the onpolicy_trainer keyword
# named in the trailing comment.
n_steps = 5000              # step_per_epoch
n_max_epochs = 10           # max_epoch
rep_per_collector = 10      # repeat_per_collect
eps_per_test = 10           # episode_per_test
batch_size = 256            # batch_size
n_steps_per_collect = 2000  # step_per_collect
reward_stop = 4             # threshold used by the stop criterion below


def reached_reward_target(mean_reward):
    """Return True once the mean reward handed in by the trainer is at least `reward_stop`."""
    return mean_reward >= reward_stop


trainer_kwargs = dict(
    max_epoch=n_max_epochs,
    step_per_epoch=n_steps,
    repeat_per_collect=rep_per_collector,
    episode_per_test=eps_per_test,
    batch_size=batch_size,
    step_per_collect=n_steps_per_collect,
    stop_fn=reached_reward_target,
)

# Run PPO training; `result` holds the summary returned by the trainer.
result = onpolicy_trainer(policy, train_collector, test_collector, **trainer_kwargs)

Epoch #1: 6000it [00:00, 7810.36it/s, env_step=6000, len=11, loss=1.799, loss/clip=-0.006, loss/ent=1.094, loss/vf=3.632, n/ep=184, n/st=2000, rew=-2.58]                                                                                                                      


Epoch #1: test_reward: -2.175000 ± 0.000000, best_reward: -2.175000 ± 0.000000 in #1


Epoch #2: 6000it [00:00, 7988.76it/s, env_step=12000, len=11, loss=0.707, loss/clip=-0.005, loss/ent=0.995, loss/vf=1.442, n/ep=184, n/st=2000, rew=-2.11]                                                                                                                     


Epoch #2: test_reward: -2.175000 ± 0.000000, best_reward: -2.175000 ± 0.000000 in #1


Epoch #3: 6000it [00:00, 7964.16it/s, env_step=18000, len=11, loss=0.426, loss/clip=-0.010, loss/ent=0.930, loss/vf=0.892, n/ep=184, n/st=2000, rew=-1.95]                                                                                                                     


Epoch #3: test_reward: -2.175000 ± 0.000000, best_reward: -2.175000 ± 0.000000 in #1


Epoch #4: 6000it [00:00, 7971.63it/s, env_step=24000, len=11, loss=0.439, loss/clip=-0.008, loss/ent=0.924, loss/vf=0.913, n/ep=176, n/st=2000, rew=-1.52]                                                                                                                     


Epoch #4: test_reward: -2.175000 ± 0.000000, best_reward: -2.175000 ± 0.000000 in #1


Epoch #5: 6000it [00:00, 7967.53it/s, env_step=30000, len=11, loss=0.612, loss/clip=-0.010, loss/ent=0.855, loss/vf=1.260, n/ep=176, n/st=2000, rew=-0.59]                                                                                                                     


Epoch #5: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5


Epoch #6: 6000it [00:00, 7989.74it/s, env_step=36000, len=11, loss=0.642, loss/clip=-0.016, loss/ent=0.688, loss/vf=1.330, n/ep=184, n/st=2000, rew=0.12]                                                                                                                      


Epoch #6: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5


Epoch #7: 6000it [00:00, 7972.68it/s, env_step=42000, len=11, loss=0.419, loss/clip=-0.011, loss/ent=0.467, loss/vf=0.868, n/ep=184, n/st=2000, rew=1.09]                                                                                                                      


Epoch #7: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5


Epoch #8: 6000it [00:00, 7786.36it/s, env_step=48000, len=11, loss=0.273, loss/clip=-0.007, loss/ent=0.307, loss/vf=0.566, n/ep=184, n/st=2000, rew=1.32]                                                                                                                      


Epoch #8: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5


Epoch #9: 6000it [00:00, 7983.85it/s, env_step=54000, len=11, loss=0.108, loss/clip=-0.004, loss/ent=0.237, loss/vf=0.228, n/ep=184, n/st=2000, rew=1.56]                                                                                                                      


Epoch #9: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5


Epoch #10: 6000it [00:00, 7942.38it/s, env_step=60000, len=11, loss=0.068, loss/clip=-0.003, loss/ent=0.185, loss/vf=0.145, n/ep=176, n/st=2000, rew=1.64]                                                                                                                     

Epoch #10: test_reward: 1.750000 ± 0.000000, best_reward: 1.750000 ± 0.000000 in #5





In [27]:
# Print the summary dict returned by the onpolicy_trainer call (timings, step
# and episode counts, best reward -- see the captured output below).
print(result)

{'duration': '10.43s', 'train_time/model': '2.80s', 'test_step': 1210, 'test_episode': 110, 'test_time': '0.21s', 'test_speed': '5696.36 step/s', 'best_reward': 1.7499999906867743, 'best_result': '1.75 ± 0.00', 'train_step': 60000, 'train_episode': 5452, 'train_time/collector': '7.42s', 'train_speed': '5871.46 step/s'}


In [None]:
# Let's watch its performance!
# Switch the policy to evaluation mode and roll out 10 episodes on the test
# environments without rendering.
policy.eval()
result = test_collector.collect(n_episode=10, render=False)

# Average the per-episode rewards and lengths reported by the collector.
mean_reward = result["rews"].mean()
mean_length = result["lens"].mean()
print("Final reward: {}, length: {}".format(mean_reward, mean_length))

In [None]:
# Print whatever `result` currently holds; when the cells are run in order
# this is the statistics object returned by the evaluation `collect` call above.
print(result)