In [2]:
# Import of custom gym.Env
from gym_env.envs.pertubation_world import PerturbationEnv

# Import checker to ensure environment is suitable for StableBaselines usage
from stable_baselines3.common.env_checker import check_env

# Checks our policy and returns information about it
from stable_baselines3.common.evaluation import evaluate_policy

# Import reinforcement learning algorithm library
from stable_baselines3 import A2C, DDPG, HER, SAC, TD3, PPO

# Create playground
from resources.create_playground import createPlayground

# Build the playground. The two positional arguments are held in named
# variables so the layout literal can be read separately from the call.
# NOTE(review): what the number pairs and boolean pairs mean is decided by
# createPlayground, which is not visible in this notebook — confirm there.
playground_flags = (True, True)
playground_entries = [
    [(-100, 30), (True, True)],
    [(100, 10), (False, True)],
    [(100, 100), (False, False)],
    [(-100, -100), (True, False)],
]
playground2 = createPlayground(playground_flags, playground_entries)

# Wrap the playground in the custom gym environment that the later cells use
env = PerturbationEnv(playground2)

In [3]:
# Sanity-check the custom environment before any training is attempted.
# Code taken from: https://stable-baselines3.readthedocs.io/en/master/guide/custom_env.html
# check_env comes from stable_baselines3.common.env_checker. If this cell
# finishes without raising an error, the environment is considered suitable
# for use with Stable-Baselines3.
env.reset()
check_env(env)

In [None]:
# Examples of action and observation: print one randomly sampled action and
# one randomly sampled observation, together with the shape of each space.
env.reset()
print("sample action:", env.action_space.sample())
# The value printed here is the action space's shape, so the label says so
# (mirrors the "observation space shape:" line that follows).
print("action space shape:", env.action_space.shape)
print("observation space shape:", env.observation_space.shape)
print("sample observation:", env.observation_space.sample())

In [None]:
# Random-agent rollout: keep feeding randomly sampled actions to the
# environment until it reports the episode as done (more likely because the
# time limit was reached than because the task was completed).
# env.render() is called once per step; according to the original notes it
# plots the playground as an image, and the per-step images are afterwards
# concatenated and stored as a single file by env.save_images().
obs = env.reset()
while True:
    random_action = env.action_space.sample()
    obs, reward, done, msg = env.step(random_action)
    step_report = f"TimeStep: {env.playground.timestep}, Observation: {obs}, Reward: {reward}, Done: {done}, Message: {msg}"
    print(step_report)
    env.render()
    if done:
        # The step that reported done has been printed and rendered; stop.
        break
print(f"Completed in {env.playground.timestep} timesteps!")
env.save_images("random")

# Per the original notes, a png containing every step of the episode will be
# available in IN3007/gym_env/results/pngs/random.png.
# Each step image corresponds to one printed line above: the timestep, the
# agent's observation, the current reward the agent has obtained, whether the
# environment is considered done, and any message sent back from the step.

In [None]:
# Train a PPO agent in rounds, saving a checkpoint after every round.

# Folder for saved models and folder for TensorBoard logs, for convenience
models_dir = "models/PPO"
logdir = "logs"

model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=logdir)

# Keyword arguments that are identical for every learn() call below:
# the timestep counter is not reset between rounds, and all rounds log under
# the same "PPO" name.
ppo_learn_kwargs = {"reset_num_timesteps": False, "tb_log_name": "PPO"}

# range(1, 30) yields round numbers 1..29, i.e. 29 training rounds
for i in range(1, 30):
    # Each round requests one playground time limit's worth of timesteps
    model.learn(total_timesteps=env.playground.time_limit, **ppo_learn_kwargs)
    # The checkpoint is named after time_limit multiplied by the round number
    checkpoint_path = f"{models_dir}/{env.playground.time_limit*i}"
    model.save(checkpoint_path)

# Discard the in-memory model; the following cell loads one back from disk
del model

In [None]:
# Load the trained model from the checkpoint named "9000.zip" inside models_dir
# and attach the current environment to it.
# NOTE(review): the PPO training loop names checkpoints time_limit * i for
# i in 1..29, so this file only exists if env.playground.time_limit divides
# 9000 with a quotient in that range — confirm against the actual time limit.
# NOTE(review): models_dir is rebound to "models/A2C" by the A2C cell further
# down; run this cell before that one (or after re-running the PPO cell).
model_path = f"{models_dir}/9000.zip"
model = PPO.load(path=model_path, env=env, print_system_info=True)

In [None]:
# Evaluate the agent: run the loaded model for 10 episodes on the environment
# the model itself holds, then report the mean and standard deviation of the
# rewards returned by evaluate_policy.
eval_env = model.get_env()
evaluation = evaluate_policy(model, eval_env, n_eval_episodes=10)
mean_reward, std_reward = evaluation
print(f"Mean Reward: {mean_reward}, std_reward: {std_reward}")

In [None]:
# Testing the model: let the loaded policy act in the environment for several
# episodes, rendering every step and saving each episode's images.

# One entry per episode: the reward returned by that episode's last step
results = []
for i in range(1, 10):  # episodes are numbered 1..9
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            # Episode finished; `reward` now holds the final step's reward
            break
    results.append(reward)
    env.save_images(f"PPO_V0_{i}")
print(results)

# # Enjoy trained agent
# episodes = 10
#
# for ep in range(episodes):
#     obs = env.reset()
#     done = False
#     while not done:
#         env.render()
#         action, _states = model.predict(obs)
#         obs, reward, done, info = env.step(action)
#         env.save_images(f"PPO_V0_ep{episodes}")

# obs = vec_env.reset()
# episodes = 1000
# for i in range(episodes):
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = vec_env.step(action)
#     vec_env.render()
#     #env.save_images(f"trainV0{episodes}")

In [None]:
# A2C: train an A2C agent on the same environment in fixed-size rounds,
# saving a checkpoint after every round.

# Folder for saved A2C models and the number of timesteps requested per round.
# Note that this rebinds models_dir, which the PPO cells also use.
models_dir = "models/A2C"
TIME_STEPS = 10000

# logdir is the TensorBoard folder defined in the PPO training cell
model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=logdir)

# Keyword arguments shared by every learn() call: the timestep counter is not
# reset between rounds, and all rounds log under the "A2C" name.
a2c_learn_kwargs = {"reset_num_timesteps": False, "tb_log_name": "A2C"}

# range(1, 10) yields round numbers 1..9, i.e. 9 training rounds
for i in range(1, 10):
    model.learn(total_timesteps=TIME_STEPS, **a2c_learn_kwargs)
    # The checkpoint is named after TIME_STEPS multiplied by the round number
    checkpoint_path = f"{models_dir}/{TIME_STEPS * i}"
    model.save(checkpoint_path)

In [None]:
# Final cell: close the environment once no further cell needs it.
env.close()