Code from RLlib docs:

In [1]:
# Import the RL algorithm (Trainer) we would like to use.
from ray.rllib.agents.ppo import PPOTrainer

# Configure the algorithm.
config = {
    # Environment (RLlib understands OpenAI Gym registered strings).
    "env": "Taxi-v3",
    # Use 2 environment workers (aka "rollout workers") that collect
    # samples in parallel from their own environment clone(s).
    "num_workers": 2,
    # Change this to "framework: torch", if you are using PyTorch.
    # Also, use "framework: tf2" for tf2.x eager execution.
    "framework": "tf",
    # Tweak the default model provided automatically by RLlib,
    # given the environment's observation and action spaces.
    "model": {
        "fcnet_hiddens": [64, 64],
        "fcnet_activation": "relu",
    },
    # Set up a separate evaluation worker set for the
    # `trainer.evaluate()` call after training (see below).
    "evaluation_num_workers": 1,
    # Only for evaluation runs, render the env.
    "evaluation_config": {
        "render_env": True,
    },
}

# Number of training iterations to run before evaluating.
NUM_TRAINING_ITERATIONS = 3

# Create our RLlib Trainer. It is built outside the `try` block so that the
# `finally` clause below only runs when there is a trainer to stop.
trainer = PPOTrainer(config=config)

try:
    # Run it for n training iterations. A training iteration includes
    # parallel sample collection by the environment workers as well as
    # loss calculation on the collected batch and a model update.
    for _ in range(NUM_TRAINING_ITERATIONS):
        print(trainer.train())

    # Evaluate the trained Trainer (and render each timestep to the shell's
    # output).
    evaluation_results = trainer.evaluate()
finally:
    # Always release the trainer's resources, even if training or
    # evaluation raised; otherwise re-running this cell keeps stacking up
    # worker processes in the same kernel session.
    trainer.stop()

# Keep the evaluation metrics as the cell's final expression so the
# notebook still displays them as the cell output.
evaluation_results


2022-03-14 10:27:33,863	INFO trainable.py:125 -- Trainable.setup took 22.767 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


{'episode_reward_max': -650.0, 'episode_reward_min': -938.0, 'episode_reward_mean': -774.65, 'episode_len_mean': 200.0, 'episode_media': {}, 'episodes_this_iter': 20, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [-686.0, -794.0, -785.0, -713.0, -758.0, -758.0, -893.0, -713.0, -650.0, -938.0, -821.0, -794.0, -776.0, -839.0, -785.0, -794.0, -677.0, -785.0, -722.0, -812.0], 'episode_lengths': [200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 0.05672068550609338, 'mean_inference_ms': 0.25350275425717933, 'mean_action_processing_ms': 0.025313654760906894, 'mean_env_wait_ms': 0.023519855806197247, 'mean_env_render_ms': 0.0}, 'off_policy_estimator': {}, 'num_healthy_workers': 2, 'timesteps_total': 4000, 'timesteps_this_iter': 4000, 'agent_timesteps_total': 4000, 'timers': {'sample_time_ms': 1063.943, 'sample_t



[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m |[35mR[0m: | : :G|
[2m[36m(RolloutWorker pid=49937)[0m | : | : : |
[2m[36m(RolloutWorker pid=49937)[0m | : : : : |
[2m[36m(RolloutWorker pid=49937)[0m | |[43m [0m: | : |
[2m[36m(RolloutWorker pid=49937)[0m |Y| : |[34;1mB[0m: |
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m   (Dropoff)
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m |[35mR[0m: | : :G|
[2m[36m(RolloutWorker pid=49937)[0m | : | : : |
[2m[36m(RolloutWorker pid=49937)[0m | : : : : |
[2m[36m(RolloutWorker pid=49937)[0m | |[43m [0m: | : |
[2m[36m(RolloutWorker pid=49937)[0m |Y| : |[34;1mB[0m: |
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m   (Pickup)
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m |[35mR[0m: | : :G|
[2m[36m(Rollou

{'evaluation': {'episode_reward_max': -59.0,
  'episode_reward_min': -605.0,
  'episode_reward_mean': -469.4,
  'episode_len_mean': 183.5,
  'episode_media': {},
  'episodes_this_iter': 10,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [-560.0,
    -470.0,
    -479.0,
    -59.0,
    -497.0,
    -479.0,
    -605.0,
    -533.0,
    -524.0,
    -488.0],
   'episode_lengths': [200, 200, 200, 35, 200, 200, 200, 200, 200, 200]},
  'sampler_perf': {'mean_raw_obs_processing_ms': 0.039223881848237614,
   'mean_inference_ms': 0.21851880877625707,
   'mean_action_processing_ms': 0.02001144787325059,
   'mean_env_wait_ms': 0.018819874408198333,
   'mean_env_render_ms': 0.029386258592792584},
  'off_policy_estimator': {},
  'timesteps_this_iter': 0}}

[2m[36m(RolloutWorker pid=49937)[0m   (Pickup)
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m |[34;1m[43mR[0m[0m: | : :[35mG[0m|
[2m[36m(RolloutWorker pid=49937)[0m | : | : : |
[2m[36m(RolloutWorker pid=49937)[0m | : : : : |
[2m[36m(RolloutWorker pid=49937)[0m | | : | : |
[2m[36m(RolloutWorker pid=49937)[0m |Y| : |B: |
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m   (West)
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m |[34;1m[43mR[0m[0m: | : :[35mG[0m|
[2m[36m(RolloutWorker pid=49937)[0m | : | : : |
[2m[36m(RolloutWorker pid=49937)[0m | : : : : |
[2m[36m(RolloutWorker pid=49937)[0m | | : | : |
[2m[36m(RolloutWorker pid=49937)[0m |Y| : |B: |
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m   (North)
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49




[2m[36m(RolloutWorker pid=49937)[0m | :[43m [0m: : : |
[2m[36m(RolloutWorker pid=49937)[0m | | : | : |
[2m[36m(RolloutWorker pid=49937)[0m |Y| : |B: |
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m   (West)
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m |[35mR[0m: | : :[34;1mG[0m|
[2m[36m(RolloutWorker pid=49937)[0m | : | : : |
[2m[36m(RolloutWorker pid=49937)[0m |[43m [0m: : : : |
[2m[36m(RolloutWorker pid=49937)[0m | | : | : |
[2m[36m(RolloutWorker pid=49937)[0m |Y| : |B: |
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m   (West)
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m |[35mR[0m: | : :[34;1mG[0m|
[2m[36m(RolloutWorker pid=49937)[0m |[43m [0m: | : : |
[2m[36m(RolloutWorker pid=49937)[0m | : : : : |
[2m[36m(RolloutWorker pid=49937)[0m | | : | : |
[2m[36m(RolloutWor




[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m |R:[42m_[0m| : :[35mG[0m|
[2m[36m(RolloutWorker pid=49937)[0m | : | : : |
[2m[36m(RolloutWorker pid=49937)[0m | : : : : |
[2m[36m(RolloutWorker pid=49937)[0m | | : | : |
[2m[36m(RolloutWorker pid=49937)[0m |Y| : |B: |
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m   (North)
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m |R:[42m_[0m| : :[35mG[0m|
[2m[36m(RolloutWorker pid=49937)[0m | : | : : |
[2m[36m(RolloutWorker pid=49937)[0m | : : : : |
[2m[36m(RolloutWorker pid=49937)[0m | | : | : |
[2m[36m(RolloutWorker pid=49937)[0m |Y| : |B: |
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m   (East)
[2m[36m(RolloutWorker pid=49937)[0m +---------+
[2m[36m(RolloutWorker pid=49937)[0m |[42mR[0m: | : :[35mG[0m|
[2m[36m(RolloutWorker pid=4993