https://github.com/wwxFromTju/awesome-reinforcement-learning-lib#rl-library

In [13]:
import gym
import tianshou as ts
import torch, numpy as np
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger

In [14]:
# Gym task id passed to every gym.make() call in this notebook.
ENVIRONMENT_NAME = "CartPole-v1"
# How many environment copies to build for the vectorised train / test environments.
TRAIN_ENV_COUNT, TEST_ENV_COUNT = 1, 1

In [15]:
# Stand-alone environment instance; other cells read its observation/action
# spaces and its spec.reward_threshold.
env = gym.make(ENVIRONMENT_NAME)

# Each vectorised environment is sized by its own constant: TRAIN_ENV_COUNT for
# training, TEST_ENV_COUNT for evaluation (the two must not be swapped, or
# changing one constant silently resizes the other environment).
train_envs = ts.env.DummyVectorEnv([lambda: gym.make(ENVIRONMENT_NAME) for _ in range(TRAIN_ENV_COUNT)])
test_envs = ts.env.DummyVectorEnv([lambda: gym.make(ENVIRONMENT_NAME) for _ in range(TEST_ENV_COUNT)])

# TensorBoard writer wrapped in Tianshou's logger adapter; events go under assets/logs/dqn.
writer = SummaryWriter('assets/logs/dqn')
logger = TensorboardLogger(writer)

In [16]:
class Net(nn.Module):
    """Fully connected network mapping a flattened observation to one output per action.

    Args:
        state_shape: Shape of a single observation (or an int); its product is
            the number of input features.
        action_shape: Shape of the action space (or an int); its product is the
            number of outputs.
    """

    def __init__(self, state_shape, action_shape):
        super().__init__()
        # np.prod returns a NumPy scalar; convert to a plain Python int so the
        # layer sizes are accepted by nn.Linear regardless of torch/numpy version.
        in_features = int(np.prod(state_shape))
        out_features = int(np.prod(action_shape))
        self.model = nn.Sequential(
            nn.Linear(in_features, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, out_features),
        )

    def forward(self, obs, state=None, info=None):
        """Run the network on a batch of observations.

        Args:
            obs: Batch of observations; anything that is not already a
                ``torch.Tensor`` is converted to a float tensor.
            state: Passed through unchanged and returned as the second element.
            info: Accepted for interface compatibility; not used. Defaults to
                ``None`` rather than a shared mutable ``{}``.

        Returns:
            Tuple ``(logits, state)`` where ``logits`` has one row per batch
            element.
        """
        if not isinstance(obs, torch.Tensor):
            obs = torch.tensor(obs, dtype=torch.float)

        # Flatten everything after the batch dimension before the first Linear layer.
        batch = obs.shape[0]
        logits = self.model(obs.view(batch, -1))

        return logits, state

In [17]:
# Use the space's shape when it is non-empty, otherwise fall back to its `n` attribute.
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n

# Q-network sized from the environment, optimised with Adam.
net = Net(state_shape=state_shape, action_shape=action_shape)
optim = torch.optim.Adam(params=net.parameters(), lr=1e-3)

In [18]:
# DQN policy built on the network and optimiser defined above.
policy = ts.policy.DQNPolicy(
    net,
    optim,
    discount_factor=0.9,
    estimation_step=3,
    target_update_freq=320,
)

In [19]:
# Replay storage for training. The number of sub-buffers follows the number of
# training environments instead of a hard-coded 10: the total size is divided
# among the sub-buffers, so having more sub-buffers than environments would
# leave most of the 20000 slots unused.
train_buffer = ts.data.VectorReplayBuffer(20000, len(train_envs))
train_collector = ts.data.Collector(policy, train_envs, train_buffer, exploration_noise=True)

# Evaluation collector; no buffer is supplied here.
test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True)

In [20]:
# Off-policy training loop.
# - train_fn / test_fn set the policy's epsilon to 0.1 before training steps and
#   0.05 before evaluation.
# - stop_fn ends training once the mean test reward reaches the environment's
#   reward threshold.
# - logger hands the TensorBoard logger created earlier to the trainer; without
#   it the SummaryWriter for assets/logs/dqn never receives any data.
result = ts.trainer.offpolicy_trainer(
    policy, train_collector, test_collector,
    max_epoch=10, step_per_epoch=10000, step_per_collect=10,
    update_per_step=0.1, episode_per_test=100, batch_size=64,
    train_fn=lambda epoch, env_step: policy.set_eps(0.1),
    test_fn=lambda epoch, env_step: policy.set_eps(0.05),
    stop_fn=lambda mean_rewards: mean_rewards >= env.spec.reward_threshold,
    logger=logger)
print(f'Finished training! Use {result["duration"]}')

Epoch #1: 10001it [00:19, 503.43it/s, env_step=10000, len=227, loss=0.310, n/ep=1, n/st=10, rew=227.00]                           


Epoch #1: test_reward: 211.350000 ± 38.492954, best_reward: 211.350000 ± 38.492954 in #1


Epoch #2: 10001it [00:20, 487.37it/s, env_step=20000, len=159, loss=0.034, n/ep=0, n/st=10, rew=159.00]                           


Epoch #2: test_reward: 219.020000 ± 58.385783, best_reward: 219.020000 ± 58.385783 in #2


Epoch #3: 10001it [00:19, 503.32it/s, env_step=30000, len=184, loss=0.029, n/ep=0, n/st=10, rew=184.00]                           


Epoch #3: test_reward: 200.330000 ± 27.855360, best_reward: 219.020000 ± 58.385783 in #2


Epoch #4: 10001it [00:21, 472.76it/s, env_step=40000, len=169, loss=0.057, n/ep=0, n/st=10, rew=169.00]                           


Epoch #4: test_reward: 137.500000 ± 12.351113, best_reward: 219.020000 ± 58.385783 in #2


Epoch #5: 10001it [00:20, 489.30it/s, env_step=50000, len=297, loss=0.051, n/ep=0, n/st=10, rew=297.00]                           


Epoch #5: test_reward: 202.510000 ± 12.994226, best_reward: 219.020000 ± 58.385783 in #2


Epoch #6: 10001it [02:14, 74.41it/s, env_step=60000, len=288, loss=0.019, n/ep=0, n/st=10, rew=288.00]                            


Epoch #6: test_reward: 381.540000 ± 60.488746, best_reward: 381.540000 ± 60.488746 in #6


Epoch #7: 10001it [00:59, 169.26it/s, env_step=70000, len=310, loss=0.043, n/ep=0, n/st=10, rew=310.00]                           


Epoch #7: test_reward: 253.590000 ± 46.560519, best_reward: 381.540000 ± 60.488746 in #6


Epoch #8: 10001it [00:37, 263.88it/s, env_step=80000, len=177, loss=0.006, n/ep=0, n/st=10, rew=177.00]                           


Epoch #8: test_reward: 175.080000 ± 9.666106, best_reward: 381.540000 ± 60.488746 in #6


Epoch #9: 10001it [00:19, 512.05it/s, env_step=90000, len=185, loss=0.008, n/ep=0, n/st=10, rew=185.00]                           


Epoch #9: test_reward: 148.670000 ± 5.658719, best_reward: 381.540000 ± 60.488746 in #6
Finished training! Use 507.58s
