In [1]:
import gym
import numpy as np
from gym import error, spaces, utils
from gym.envs.registration import register

"""
First kind of termination: true termination
As an example, the true termination of the Breakout game in Atari 2600
comes when you lose all of your lives.
"""


class TestEnv(gym.Env):
    """
    A simple env that always ends after 10 timesteps, which 
    can be considered as the ``true termination'' from the environment.
    At each timestep, its observation is its internal timestep
    """

    def __init__(self):
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(low=np.array([-1.]),
                                            high=np.array([10.]))

    def step(self, action):
        self.t += 1
        return np.array([0.])+self.t, 1 if (self.t != 10 and self.t != 8) else 0, self.t == 10, {}

    def reset(self):
        self.t = 0
        return np.array([0.])+self.t


if "TestEnv-v0" not in gym.envs.registry.env_specs:
    register(
        "TestEnv-v0",
        entry_point='__main__:TestEnv'
    )

env = gym.make("TestEnv-v0")
print(f"env is {env}")
for i in range(2):
    all_obs = [env.reset()]
    while True:
        obs, reward, done, info = env.step(env.action_space.sample())
        all_obs += [obs]
        if done:
            print(f"all observation in episode {i}:")
            print(all_obs)
            print("true termination")
            print()
            break
print("=========")


env is <OrderEnforcing<TestEnv<TestEnv-v0>>>
all observation in episode 0:
[array([0.]), array([1.]), array([2.]), array([3.]), array([4.]), array([5.]), array([6.]), array([7.]), array([8.]), array([9.]), array([10.])]
true termination

all observation in episode 1:
[array([0.]), array([1.]), array([2.]), array([3.]), array([4.]), array([5.]), array([6.]), array([7.]), array([8.]), array([9.]), array([10.])]
true termination



  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [2]:
"""
Second kind of termination: TimeLimit termination
As an example, TimeLimit termination comes when the episode of "CartPole-v0"
exceeds length 200.
"""
if "TestEnvTimeLimit3-v0" not in gym.envs.registry.env_specs:
    register(
        "TestEnvTimeLimit3-v0",
        entry_point='__main__:TestEnv',
        max_episode_steps=8
    )

env = gym.make("TestEnvTimeLimit3-v0")
# equivalent to below
# env = TestEnv()
# env = gym.wrappers.TimeLimit(env, max_episode_steps=8)
print(f"env is {env}")
print(f"env's timelimit is {env._max_episode_steps}")
for i in range(2):
    all_obs = [env.reset()]
    while True:
        obs, reward, done, info = env.step(env.action_space.sample())
        all_obs += [obs]
        if done:
            print(f"all observation in episode {i}:")
            print(all_obs)
            print("TimeLimit termination")
            print()
            break
print("=========")

env is <TimeLimit<OrderEnforcing<TestEnv<TestEnvTimeLimit3-v0>>>>
env's timelimit is 8
all observation in episode 0:
[array([0.]), array([1.]), array([2.]), array([3.]), array([4.]), array([5.]), array([6.]), array([7.]), array([8.])]
TimeLimit termination

all observation in episode 1:
[array([0.]), array([1.]), array([2.]), array([3.]), array([4.]), array([5.]), array([6.]), array([7.]), array([8.])]
TimeLimit termination



In [3]:
from gym.vector import SyncVectorEnv
"""
Third kind of termination: early termination induced by fixed 
trajectory length of `n_steps`
This is usually combined with TimeLimit wrapped env,
but you can use it without the TimeLimit
"""
n_steps = 5
envs = SyncVectorEnv([
    lambda: gym.make("TestEnvTimeLimit3-v0")])
print(f"envs is {envs}")
print(f"envs' timelimit is {envs.envs[0]._max_episode_steps}")
obss = envs.reset()
for i in range(3):
    all_obss = []
    all_rwds = []
    all_done = []
    for j in range(n_steps):
        all_obss += [obss.astype("float")]
        obss, rewards, dones, infos = envs.step(np.array([1., 1.]))
        all_rwds.append(rewards)
        all_done.append(dones)
        # print(infos)

    print(f"all observation in trajectory {i}:")
    print(all_obss)
    print("early termination by `n_steps`")
    print()
    print(all_rwds)
    print(all_done)
print("=========")


envs is <SyncVectorEnv instance>
envs' timelimit is 8
all observation in trajectory 0:
[array([[0.]]), array([[1.]]), array([[2.]]), array([[3.]]), array([[4.]])]
early termination by `n_steps`

[array([1.]), array([1.]), array([1.]), array([1.]), array([1.])]
[array([False]), array([False]), array([False]), array([False]), array([False])]
all observation in trajectory 1:
[array([[5.]]), array([[6.]]), array([[7.]]), array([[0.]]), array([[1.]])]
early termination by `n_steps`

[array([1.]), array([1.]), array([0.]), array([1.]), array([1.])]
[array([False]), array([False]), array([ True]), array([False]), array([False])]
all observation in trajectory 2:
[array([[2.]]), array([[3.]]), array([[4.]]), array([[5.]]), array([[6.]])]
early termination by `n_steps`

[array([1.]), array([1.]), array([1.]), array([1.]), array([1.])]
[array([False]), array([False]), array([False]), array([False]), array([False])]
