In [1]:
# Import of custom gym.Env
from gym_env.envs.pertubation_world import PerturbationEnv

# Import checker to ensure environment is suitable for StableBaselines usage
from stable_baselines3.common.env_checker import check_env

# Import reinforcement learning algorithm library
from stable_baselines3 import PPO



In [2]:
# Load and initialize environment
env = PerturbationEnv()
env.reset()

array([0.49      , 0.49819675, 0.5255473 , 0.56938565, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.5860034 ,
       0.523259  , 0.49040797, 0.47265208, 0.47169906, 0.4876474 ,
       0.5189412 , 0.58008623, 0.6506919 , 0.56302756, 0.5107837 ,
       0.48052055, 0.47042534, 0.47518417, 0.49648765, 0.53712195,
       0.61073726, 1.        , 0.564358  , 0.52201533, 0.49648765,
       0.49      ], dtype=float32)

In [3]:
# Testing the environment
# Code taken from: https://stable-baselines3.readthedocs.io/en/master/guide/custom_env.html
# If no errors occur, our environment is suitable for usage
check_env(env)

In [4]:
# Examples of action and observation
print("sample action:", env.action_space.sample())

print("observation space shape:", env.observation_space.shape)
print("sample observation:", env.observation_space.sample())

sample action: [-0.53843117 -0.6841612   0.99652547]
observation space shape: (36,)
sample observation: [0.717904   0.8581264  0.9188795  0.8791235  0.42022327 0.19064082
 0.01466436 0.62208307 0.9941674  0.0017438  0.4999203  0.13427177
 0.8531085  0.6677886  0.24667516 0.8129938  0.619382   0.8563076
 0.5589326  0.07344308 0.66222304 0.36177176 0.9629579  0.96227694
 0.48808193 0.9119756  0.71448183 0.77757883 0.4101237  0.14931875
 0.5494471  0.65561557 0.24057214 0.27124223 0.05759953 0.56636405]


In [None]:
# Example of random actions (running env.render() plots the playground as an image per step)
from matplotlib import pyplot as plt
import numpy as np

env.reset()
images = []
for step in range(200):
    img = env.render()
    images.append(img)
    obs, reward, done, msg = env.step(env.action_space.sample())
    print(f"Observation: {obs}, Reward: {reward}, Done: {done}, Message: {msg}")
print(f"Completed in {env.playground.timestep} timesteps!")
plt.imsave("random.png", np.concatenate(images))

In [None]:
# Example of agent interacting in environment until completed
env.reset()
done = False
while not done:
    env.render()
    obs, reward, done, msg = env.step(env.action_space.sample())
    print(f"Observation: {obs}, Reward: {reward}, Done: {done}, Message: {msg}")
print(f"Completed in {env.playground.timestep} timesteps!")

In [14]:
# Training the model
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10_000)

vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render("rgb_array")
    # VecEnv resets automatically
    # if done:
    #   obs = env.reset()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 292      |
|    ep_rew_mean     | 10       |
| time/              |          |
|    fps             | 401      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 384         |
|    ep_rew_mean          | 10          |
| time/                   |             |
|    fps                  | 387         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.004151566 |
|    clip_fraction        | 0.011       |
|    clip_range           | 0.2         |
|    entropy_loss   

AttributeError: 'PerturbationEnv' object has no attribute 'get_np_img'

In [16]:
# Save model
import os

# Create folders for storing models
models_dir = "models/PPO"
logdir = "logs"

os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=logdir)
TIME_STEPS = 10000
for i in range(1, 10):
    model.learn(total_timesteps=TIME_STEPS, reset_num_timesteps=False, tb_log_name="PPO")
    model.save(f"{models_dir}/{TIME_STEPS * i}")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\PPO_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 657      |
|    ep_rew_mean     | 10       |
| time/              |          |
|    fps             | 450      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 507          |
|    ep_rew_mean          | 10           |
| time/                   |              |
|    fps                  | 389          |
|    iterations           | 2            |
|    time_elapsed         | 10           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0068375045 |
|    clip_fraction        | 0.02         |
|    clip_range           | 

In [None]:
# Load model
model_path = f"{models_dir}/90000.zip"
model = PPO.load(model_path, env=env)

episodes = 10

for ep in range(episodes):
    obs = env.reset()
    done = False
    while not done:
        env.render()
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)

In [None]:
# Final step - close environment
env.close()