In [1]:
# Import of custom gym.Env
from gym_env.envs.perturbation_world import PerturbationEnv

# Import checker to ensure environment is suitable for StableBaselines usage
from stable_baselines3.common.env_checker import check_env

# Checks our policy and returns information about it
from stable_baselines3.common.evaluation import evaluate_policy

# Import reinforcement learning algorithm library
from stable_baselines3 import A2C, DDPG, HER, SAC, TD3, PPO

# Needed for creating new directories
import os
from pathlib import Path

# Create playground
from resources.create_playground import createPlayground

# Simplest playground, no perturbation
# NOTE(review): the meaning of the leading (True, True) tuple and of each
# [(number, number), (flag, flag)] entry is defined by createPlayground — confirm there.
playground1_entries = [
    [(-100, 30), (True, True)],
    [(100, 10), (False, True)],
    [(100, 100), (False, False)],
    [(-100, -100), (True, False)],
]
playground1 = createPlayground((True, True), playground1_entries)

# Wrap the playground in the custom gym environment used by every cell below
env = PerturbationEnv(playground1)



In [2]:
# Testing the environment
# Code taken from: https://stable-baselines3.readthedocs.io/en/master/guide/custom_env.html
# check_env verifies that the custom environment is suitable for Stable-Baselines3 usage.
# If no errors occur, our environment is suitable for usage
env.reset()
check_env(env)

In [3]:
# Examples of action and observation
# Prints one random action and one random observation together with the shapes
# of the action and observation spaces they were sampled from.
env.reset()
print("sample action:", env.action_space.sample())
# The value printed here is the shape of the ACTION space, so label it as such.
print("action space shape:", env.action_space.shape)
print("observation space shape:", env.observation_space.shape)
print("sample observation:", env.observation_space.sample())

sample action: [-0.5249852   0.22476186 -0.5243685 ]
sample space shape: (3,)
observation space shape: (228,)
sample observation: [3.81332427e-01 7.73803353e-01 6.86693490e-01 6.93644226e-01
 3.82229924e-01 9.62894186e-02 5.77844679e-02 6.94457516e-02
 3.46816987e-01 7.19852567e-01 1.21943876e-01 4.61450815e-01
 9.05919194e-01 6.80364132e-01 7.12872520e-02 3.92785698e-01
 8.48232627e-01 8.14266026e-01 9.92008209e-01 5.75209320e-01
 6.94110990e-01 4.02314126e-01 8.60083640e-01 9.98231769e-01
 7.23545313e-01 9.23975334e-02 2.15707775e-02 6.86697960e-01
 9.22867775e-01 4.94604230e-01 7.51791775e-01 9.30615544e-01
 9.08595741e-01 9.86218214e-01 7.79154241e-01 6.24358952e-02
 2.17958689e-01 5.13981283e-01 3.59551519e-01 3.95674258e-01
 7.90712833e-01 2.54093677e-01 7.95063496e-01 4.80731368e-01
 1.68415740e-01 5.18046141e-01 2.14148730e-01 8.59943569e-01
 6.31779730e-01 5.67727447e-01 7.93249667e-01 5.44606447e-01
 1.37635887e-01 5.92610717e-01 4.53150004e-01 1.23487432e-02
 7.21616089e-01 

In [4]:
# Random-agent rollout: keep stepping with actions sampled from the action space
# until the environment is considered DONE (most likely because the time limit has been reached).
# env.render() plots the playground as an image per step;
# the per-step images can be concatenated and stored into a single one.
obs = env.reset()
done = False
while not done:
    # Guard: never step the environment past our predefined time_limit
    if env.playground.timestep >= env.playground.time_limit:
        done = True
        break
    action = env.action_space.sample()
    obs, reward, done, msg = env.step(action)
    print(
        f"TimeStep: {env.playground.timestep}, Observation: {obs}, Reward: {reward}, Done: {done}, Message: {msg}"
    )
    env.render()
print(f"Completed in {env.playground.timestep} timesteps!")
env.save_images("random")

# A png containing each step of the environment will be available in IN3007/gym_env/results/pngs/random.png
# Each step image corresponds to one printed line above: the timestep, the agent's observation,
# the reward obtained, whether the environment is considered done, and any message sent from the agent

TimeStep: 1, Observation: [4.9021730e-01 4.7624889e-01 4.7792572e-01 4.9549267e-01 5.3245938e-01
 1.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00
 1.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00
 1.0000000e+00 5.5975991e-01 5.1704073e-01 4.9298191e-01 4.8469695e-01
 4.9298191e-01 5.2060652e-01 5.6482840e-01 7.0441955e-01 5.9489816e-01
 5.2534050e-01 4.9176949e-01 4.7317201e-01 4.7132400e-01 4.8641637e-01
 5.1696754e-01 5.7159609e-01 6.6844070e-01 5.7298601e-01 5.1962775e-01
 4.9021730e-01 5.0196081e-01 5.0196081e-01 0.0000000e+00 5.0196081e-01
 5.0196081e-01 0.0000000e+00 5.0196081e-01 5.0196081e-01 0.0000000e+00
 5.0196081e-01 5.0196081e-01 1.4012985e-45 5.0196081e-01 5.0196081e-01
 1.4012985e-45 5.0196081e-01 5.0196081e-01 1.4012985e-45 5.0196081e-01
 5.0196081e-01 1.4012985e-45 5.0196081e-01 5.0196081e-01 2.8025969e-45
 5.0196081e-01 5.0196081e-01 2.8025969e-45 5.0196081e-01 5.0196081e-01
 2.8025969e-45 5.0196081e-01 5.0196081e-01 2.802596

In [None]:
# Train a PPO agent and save a checkpoint after every chunk of training

# Folders for the saved models and the TensorBoard logs
models_dir = Path("models/PPO_V1")
logdir = Path("logs")

models_dir.mkdir(parents=True, exist_ok=True)
logdir.mkdir(parents=True, exist_ok=True)

# Training is split into chunks of env.playground.time_limit timesteps (the original note
# says 1000) so that we can later choose the best point in time to load the model from.
# NOTE(review): PPO gathers experience in whole rollouts, so the true number of trained
# steps (model.num_timesteps) may be larger than the label used in the file name — confirm.
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=logdir)
for chunk in range(1, 100):
    model.learn(
        total_timesteps=env.playground.time_limit,
        reset_num_timesteps=False,  # keep one continuous timestep counter across chunks
        tb_log_name="PPO_V1",
    )
    checkpoint_label = env.playground.time_limit * chunk
    model.save(models_dir / str(checkpoint_label))
# The in-memory model is dropped here; later cells reload a chosen checkpoint from disk
del model

In [None]:
# Load the trained model
# Restores the checkpoint stored as "9000.zip" inside models_dir and attaches env to it.
model_path = Path(models_dir, "9000.zip")
model = PPO.load(model_path, env=env, print_system_info=True)

In [None]:
# Evaluate the agent
# Runs 10 evaluation episodes on the environment attached to the model and
# reports the mean and standard deviation of the reward.
eval_env = model.get_env()
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"Mean Reward: {mean_reward}, std_reward: {std_reward}")

In [None]:
# Testing the model
#
# Rolls out the loaded policy for episodes 1..9, rendering every step, and records
# [final timestep, reward of the last step, done flag returned by the environment]
# for each episode.

results = []
for episode in range(1, 10):
    obs = env.reset()
    done = False
    # Reset per episode so an episode that takes no step cannot report a stale
    # reward left over from a previous episode or an earlier cell.
    reward = None
    # Stop when the environment reports done, or when the time limit is reached.
    # `done` is intentionally NOT forced to True at the time limit: the recorded
    # flag then tells a finished episode apart from one cut off by the limit.
    while not done and env.playground.timestep < env.playground.time_limit:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        print(
            f"Test {episode}, current timestep: {env.playground.timestep} Reward: {reward}, Done: {done}"
        )
    results.append([env.playground.timestep, reward, done])
    # NOTE(review): images are tagged "PPO_V0" while the training cell stores the
    # model under "PPO_V1" — confirm which label is intended.
    env.save_images(f"PPO_V0_{episode}")
print("[Timestep, Reward, Done]")
print(results)

In [None]:
# A2C
# Trains an A2C agent on the same environment and saves a checkpoint after every
# TIME_STEPS timesteps of training.

# Paths for models and logs folder, for convenience.
# Both are defined here so this cell does not rely on variables left behind by
# another training cell.
models_dir = Path("models/A2C")
logdir = Path("logs")

os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=logdir)

TIME_STEPS = 10000
for i in range(1, 10):
    model.learn(
        # reset_num_timesteps=False keeps one continuous timestep counter across chunks
        total_timesteps=TIME_STEPS, reset_num_timesteps=False, tb_log_name="A2C"
    )
    # Checkpoint file is named after the cumulative number of requested timesteps
    model.save(models_dir / str(TIME_STEPS * i))

In [None]:
# Close the environment now that all cells above have finished using it
env.close()