In [47]:
import numpy as np
import torch as t
import pytorch_lightning as pl
import logging
import uuid

In [4]:
# Raise the threshold of the "reagent" logger to ERROR.
logging.getLogger(name="reagent").setLevel(level=logging.ERROR)

In [53]:
from reagent.gym.envs import Gym
from reagent.core.parameters import RLParameters
from reagent.optimizer.uninferrable_optimizers import Adam
from reagent.optimizer.union import Optimizer__Union
from reagent.training import DQNTrainerParameters
from reagent.net_builder.unions import DiscreteDQNNetBuilder__Union
from reagent.net_builder.discrete_dqn.fully_connected import FullyConnected
from reagent.model_managers.discrete.discrete_dqn import DiscreteDQN
from reagent.gym.utils import build_normalizer, fill_replay_buffer
from reagent.replay_memory.circular_replay_buffer import ReplayBuffer
from reagent.gym.policies.random_policies import make_random_policy_for_env
from reagent.gym.agents.agent import Agent
from reagent.gym.datasets.replay_buffer_dataset import ReplayBufferDataset
from reagent.gym.runners.gymrunner import evaluate_for_n_episodes

In [5]:
# Environment for the whole notebook: "CartPole-v0" wrapped in ReAgent's Gym env.
env = Gym(env_name="CartPole-v0")

In [7]:
# RL hyper-parameters handed to the DQN trainer parameters below.
# Every field is spelled out so the full configuration is visible in one place;
# the grouping is by field name only.
rl_params = RLParameters(
    # discount and time handling
    gamma=0.99,
    multi_steps=None,
    time_diff_unit_length=1.0,
    use_seq_num_diff_as_time_diff=False,
    # Q-learning target and loss
    maxq_learning=True,
    target_update_rate=0.2,
    q_network_loss="mse",
    reward_boost=None,
    # policy-related fields
    epsilon=0.1,
    temperature=1.0,
    softmax_policy=False,
    # remaining switches and tolerances
    set_missing_value_to_zero=False,
    tensorboard_logging_freq=0,
    predictor_atol_check=0.0,
    predictor_rtol_check=5e-05,
    ratio_different_predictions_tolerance=0.0,
)

In [13]:
# Adam configuration, wrapped in the optimizer union that the trainer
# parameters take as their `optimizer` field.
adam = Adam(
    lr=0.05,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0.0,
    amsgrad=False,
    lr_schedulers=[],
)
optim = Optimizer__Union(Adam=adam)

In [15]:
# Trainer parameters for a double-DQN agent over the two discrete actions.
#
# `minibatch_size` is 512 to agree with the `minibatch_size = 512` in the
# run-configuration cell, which is the batch size the replay buffer and the
# training dataset actually produce. It was 1024 here before, so the trainer
# was told a batch size it never received. Keep the two values in sync.
trainer_params = DQNTrainerParameters(
    actions=["0", "1"],
    rl=rl_params,
    double_q_learning=True,
    bcq=None,
    minibatch_size=512,
    minibatches_per_step=1,
    optimizer=optim,
)

In [17]:
# Builder passed as `net_builder` to DiscreteDQN. Only the FullyConnected
# member of the union is populated; the other members are explicitly None.
net_builder = DiscreteDQNNetBuilder__Union(
    FullyConnected=FullyConnected(
        sizes=[128, 64],
        activations=["leaky_relu", "leaky_relu"],
        use_batch_norm=False,
        dropout_ratio=0.0,
    ),
    Dueling=None,
    FullyConnectedWithEmbedding=None,
)

In [18]:
# Builder passed as `cpe_net_builder` to DiscreteDQN. It uses the same union
# type as `net_builder` but with wider layers and plain ReLU activations.
cpe_net_builder = DiscreteDQNNetBuilder__Union(
    FullyConnected=FullyConnected(
        sizes=[256, 128],
        activations=["relu", "relu"],
        use_batch_norm=False,
        dropout_ratio=0.0,
    ),
    Dueling=None,
    FullyConnectedWithEmbedding=None,
)

In [21]:
# Model manager combining the trainer parameters with both network builders.
# It is used below to build the trainer and to create the policies.
dqn = DiscreteDQN(
    cpe_net_builder=cpe_net_builder,
    net_builder=net_builder,
    trainer_param=trainer_params,
)

In [22]:
# --- Run configuration -------------------------------------------------------
# Seed the global NumPy and PyTorch generators before any network is built or
# any transition is sampled, so a fresh "Restart & Run All" starts from the
# same RNG state each time.
# NOTE(review): the Gym environment keeps its own RNG, which is not seeded
# here. Seed `env` as well if bit-exact episodes are required.
seed = 0
np.random.seed(seed)
t.manual_seed(seed)

replay_memory_size = 100000  # replay_capacity of the ReplayBuffer
train_every_ts = 1           # training_frequency of the training dataset
train_after_ts = 20000       # transitions collected by the random agent before training
num_train_episodes = 30      # num_episodes of the training dataset
passing_score_bar = 100.0    # presumably the mean evaluation reward that counts as a pass
num_eval_episodes = 20       # episodes rolled out by evaluate_for_n_episodes
use_gpu = False              # converted to the Lightning trainer's `gpus` count
minibatch_size = 512         # batch_size of both the ReplayBuffer and the dataset

In [24]:
# Normalization data derived from `env`; passed as `normalization_data_map`
# when building the trainer and, later, the serving policy.
normalization = build_normalizer(env)

In [28]:
# Build the DQN trainer from the model manager.
# The device choice follows the `use_gpu` switch from the run configuration
# instead of a second hard-coded False, so flipping that one flag affects the
# trainer as well as the Lightning `gpus` setting.
trainer = dqn.build_trainer(
    use_gpu=use_gpu,
    normalization_data_map=normalization,
)

In [30]:
# Policy the agent acts with during training episodes (serving=False; the
# serving variant is created separately once training has finished).
training_policy = dqn.create_policy(trainer, serving=False)

In [32]:
# Replay memory shared by the warm-up phase and the training dataset.
replay_buffer = ReplayBuffer(
    batch_size=minibatch_size,
    replay_capacity=replay_memory_size,
)

In [34]:
# Torch device handed to the training agent and the dataset. It is derived from
# the `use_gpu` switch so it cannot drift from the trainer / Lightning
# settings; with use_gpu = False this is still the CPU.
device = t.device("cuda" if use_gpu else "cpu")

In [36]:
# Random policy for `env`; only used by the warm-up agent that pre-fills the replay buffer.
random_policy = make_random_policy_for_env(env)

In [38]:
# Warm-up agent acting with the random policy. `agent` is rebound to a
# training-policy agent once the replay buffer has been filled.
agent = Agent.create_for_env(env, policy=random_policy)

In [39]:
# Warm-up: let the random agent interact with `env` until the replay buffer
# holds `train_after_ts` transitions.
fill_replay_buffer(
    env=env,
    agent=agent,
    replay_buffer=replay_buffer,
    desired_size=train_after_ts,
)

Filling replay buffer from 0 to size 20000: 100%|██████████| 20000/20000 [00:09<00:00, 2037.28it/s]


In [40]:
# Rebind `agent`: from here on it acts with the training policy on `device`,
# replacing the random warm-up agent.
agent = Agent.create_for_env(
    env,
    device=device,
    policy=training_policy,
)

In [43]:
# Training dataset backed by the replay buffer. It receives the environment,
# the training agent and the buffer, together with the batch size, training
# frequency and episode count from the run configuration.
#
# The per-episode step cap is read from `env.max_steps`, the same value the
# evaluation cell passes, instead of a hard-coded 200. Training and evaluation
# therefore cannot disagree if the environment changes.
# NOTE(review): for CartPole-v0 this is expected to equal the previous literal
# 200; confirm against the env wrapper.
dataset = ReplayBufferDataset.create_for_trainer(
    trainer,
    env,
    agent,
    replay_buffer,
    batch_size=minibatch_size,
    training_frequency=train_every_ts,
    num_episodes=num_train_episodes,
    max_steps=env.max_steps,
    device=device,
)

INFO:reagent.gym.preprocessors.trainer_preprocessor:Deriving trainer_preprocessor from OrderedDict([('training_batch', <Parameter "training_batch: reagent.core.types.DiscreteDqnInput">), ('batch_idx', <Parameter "batch_idx: int">)])


In [45]:
# Wrap the dataset for Lightning. DataLoader's default batch_size of 1 hands
# the collate function a one-element list, and the lambda unwraps it so each
# dataset item reaches the trainer unchanged (presumably the items are already
# full training batches; verify against ReplayBufferDataset).
data_loader = t.utils.data.DataLoader(
    dataset=dataset,
    collate_fn=lambda items: items[0],
)

In [48]:
# Lightning trainer: one epoch, GPU count taken from `use_gpu`, deterministic
# mode on, and a fresh UUID-suffixed root directory so runs do not share logs.
pl_trainer = pl.Trainer(
    default_root_dir=f"lightning_log_{uuid.uuid4()}",
    deterministic=True,
    gpus=int(use_gpu),
    max_epochs=1,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [49]:
# Run training. Iterating `data_loader` is what drives the training episodes
# whose rewards are printed in the output below.
pl_trainer.fit(trainer, data_loader)

  rank_zero_warn(f'you defined a {step_name} but have no {loader_name}. Skipping {stage} loop')

  | Name                 | Type              | Params
-----------------------------------------------------------
0 | q_network            | FullyConnectedDQN | 9.0 K 
1 | q_network_target     | FullyConnectedDQN | 9.0 K 
2 | reward_network       | FullyConnectedDQN | 34.4 K
3 | q_network_cpe        | FullyConnectedDQN | 34.4 K
4 | q_network_cpe_target | FullyConnectedDQN | 34.4 K
-----------------------------------------------------------
121 K     Trainable params
0         Non-trainable params
121 K     Total params
0.485     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]


Training episode: 1, total episode reward = 18.0

Training episode: 2, total episode reward = 9.0

Training episode: 3, total episode reward = 10.0

Training episode: 4, total episode reward = 102.0

Training episode: 5, total episode reward = 200.0

Training episode: 6, total episode reward = 200.0

Training episode: 7, total episode reward = 104.0

Training episode: 8, total episode reward = 145.0

Training episode: 9, total episode reward = 108.0

Training episode: 10, total episode reward = 132.0

Training episode: 11, total episode reward = 94.0

Training episode: 12, total episode reward = 98.0

Training episode: 13, total episode reward = 79.0

Training episode: 14, total episode reward = 82.0

Training episode: 15, total episode reward = 85.0

Training episode: 16, total episode reward = 94.0

Training episode: 17, total episode reward = 200.0

Training episode: 18, total episode reward = 200.0

Training episode: 19, total episode reward = 200.0

Training episode: 20, total ep

In [50]:
# Serving-mode policy built from the trained trainer; unlike the training
# policy it is also given the normalization data.
serving_policy = dqn.create_policy(
    trainer,
    normalization_data_map=normalization,
    serving=True,
)

  input.shape == input_presence_byte.shape


In [51]:
# Rebind `agent` once more: the evaluation agent acts with the serving policy.
agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

In [54]:
# Roll out the serving agent for `num_eval_episodes` episodes in one process.
# squeeze(1) removes the size-1 second axis, leaving one reward per episode.
eval_rewards = evaluate_for_n_episodes(
    env=env,
    agent=agent,
    n=num_eval_episodes,
    max_steps=env.max_steps,
    num_processes=1,
).squeeze(1)

In [55]:
# Mean reward over the evaluation episodes, checked against the configured
# pass mark (`passing_score_bar`) so a regressed run fails loudly instead of
# just printing a lower number. The mean stays the cell's displayed value.
mean_eval_reward = np.mean(eval_rewards)
assert mean_eval_reward >= passing_score_bar, (
    f"mean evaluation reward {mean_eval_reward} is below "
    f"passing_score_bar={passing_score_bar}"
)
mean_eval_reward

200.0

In [56]:
# Display the individual per-episode evaluation rewards.
eval_rewards

array([200., 200., 200., 200., 200., 200., 200., 200., 200., 200., 200.,
       200., 200., 200., 200., 200., 200., 200., 200., 200.])