In [1]:
# Allows notebook to reload, ensuring any imports are up-to-date
# More useful for developer rather than a third-party viewing notebook
%load_ext autoreload
%autoreload 2

%reload_ext autoreload

In [2]:
# Import of custom gym.Env
from gym_env.envs.blue_apple_playground import BlueAppleEnv

# Import checker to ensure environment is suitable for StableBaselines usage
from stable_baselines3.common.env_checker import check_env

# Import reinforcement learning algorithm library
from stable_baselines3 import A2C, DDPG, SAC, TD3, PPO

# Get our methods
from gym_env.envs.utils import (
    train_PPO,
    train_DDPG,
    train_SAC,
    train_TD3,
    train_A2C,
    random_actions,
    testing_model,
)

# Get our preset variables
from gym_env.envs.utils import modelsdir, logsdir, gifsdir
from gym_env.envs.utils import blue_apple_playground

# Needed for creating new directories
import os
from pathlib import Path

# Ensure each output directory (models, logs, gifs) exists; existing ones are kept
for _output_dir in (modelsdir, logsdir, gifsdir):
    os.makedirs(_output_dir, exist_ok=True)
del _output_dir  # do not leave the loop helper behind in the notebook namespace

# Create the custom environment and remember the name used to label its outputs
# (author's note: Blue Apples invert the x axis)
env = BlueAppleEnv()
env_name = blue_apple_playground



In [3]:
# Validate the custom environment with Stable-Baselines3's environment checker.
# Usage follows: https://stable-baselines3.readthedocs.io/en/master/guide/custom_env.html
# The environment is reset first; if check_env raises no error, the environment
# is suitable for use with the library.
env.reset()
check_env(env)

In [4]:
# Start from a freshly reset environment, then show one random sample and the
# shape of both the action space and the observation space.
env.reset()

action_space = env.action_space
observation_space = env.observation_space

print(f"{env_name} sample action:", action_space.sample())
print(f"{env_name} action space shape:", action_space.shape)
print(f"{env_name} sample observation:", observation_space.sample())
print(f"{env_name} observation space shape:", observation_space.shape)

blue_apple_playground sample action: [-0.87062556 -0.9415967   0.19596833]
blue_apple_playground action space shape: (3,)
blue_apple_playground sample observation: [0.9045042  0.9938989  0.43489328 0.5303002  0.42501006 0.33161
 0.81223416 0.19242512 0.4944763  0.41414708 0.10498023 0.88150084
 0.9890962  0.04528832 0.70219046 0.7943092  0.0360476  0.5446068
 0.6609931  0.72389936 0.37391546 0.32626104 0.13412423 0.7865414
 0.13601209 0.5295977  0.8020779  0.7103429  0.09738936 0.46977118
 0.15417427 0.38383302 0.01021426 0.8731765  0.11343952 0.8947838
 0.03358735 0.9997053  0.54092836 0.06352758 0.75191784 0.10496931
 0.17150892 0.1320628  0.9724652  0.39319834 0.5789634  0.01804654
 0.935051   0.46828744 0.7961761  0.6245063  0.5307114  0.39358333
 0.2519851  0.40530783 0.4825628  0.0410486  0.49842215 0.60901105
 0.7294807  0.17616332 0.2592856  0.6711481  0.79313266 0.5704157
 0.16439328 0.32927814 0.02935033 0.02166775 0.0759506  0.7773809
 0.4803903  0.7226091  0.6083038  0.0057

In [5]:
# Baseline run: drive the environment with random actions via the project helper.
# NOTE(review): presumably this also records the rollout shown below as
# gifs/<env_name>_random.gif; confirm in gym_env.envs.utils.random_actions.
random_actions(env, env_name)

TimeStep: 1, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 2, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 3, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 4, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 5, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 6, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 7, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 8, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 9, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 10, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 11, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 12, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 13, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 14, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 15, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 16, Reward: -0.01, Done:

![blue_apple_playground_random.gif](gifs/blue_apple_playground_random.gif)

Testing and training various reinforcement learning models

In [6]:
# PPO: train -> reload -> evaluate

# Training step; the helper's result is unpacked as (model name, model directory)
model_name, model_dir = train_PPO(env, env_name)

# Reload the saved model from "<model_dir>/<env_name>.zip" and attach the environment
model_path = Path(f"{model_dir}/{env_name}.zip")
model = PPO.load(
    path=model_path,
    env=env,
    print_system_info=True,
)

# Evaluate the reloaded model on the environment
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\PPO_V1_blue_apple_playground_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 10       |
| time/              |          |
|    fps             | 79       |
|    iterations      | 1        |
|    time_elapsed    | 25       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 10          |
| time/                   |             |
|    fps                  | 12          |
|    iterations           | 2           |
|    time_elapsed         | 329         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008053033 |
|    clip_fraction        | 0.0727      |
|    clip_range

Exception: code() argument 13 must be str, not int
Exception: code() argument 13 must be str, not int


Episode 1, TimeStep: 33, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 34, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 35, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 36, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 37, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 38, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 39, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 40, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 41, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 42, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 43, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 44, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 45, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, T

Episode 1
![](gifs/PPO_V1/blue_apple_playground/ep_1.gif)
Episode 2
![](gifs/PPO_V1/blue_apple_playground/ep_2.gif)
Episode 3
![](gifs/PPO_V1/blue_apple_playground/ep_3.gif)
Episode 4
![](gifs/PPO_V1/blue_apple_playground/ep_4.gif)
Episode 5
![](gifs/PPO_V1/blue_apple_playground/ep_5.gif)
Episode 6
![](gifs/PPO_V1/blue_apple_playground/ep_6.gif)
Episode 7
![](gifs/PPO_V1/blue_apple_playground/ep_7.gif)
Episode 8
![](gifs/PPO_V1/blue_apple_playground/ep_8.gif)
Episode 9
![](gifs/PPO_V1/blue_apple_playground/ep_9.gif)

In [7]:
# A2C: train -> reload -> evaluate

# Training step; the helper's result is unpacked as (model name, model directory)
model_name, model_dir = train_A2C(env, env_name)

# Reload the saved model from "<model_dir>/<env_name>.zip" and attach the environment
model_path = Path(f"{model_dir}/{env_name}.zip")
model = A2C.load(
    path=model_path,
    env=env,
    print_system_info=True,
)

# Evaluate the reloaded model on the environment
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\A2C_V1_blue_apple_playground_0
------------------------------------
| time/                 |          |
|    fps                | 1        |
|    iterations         | 100      |
|    time_elapsed       | 404      |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -4.26    |
|    explained_variance | -54      |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.027   |
|    std                | 1        |
|    value_loss         | 4.96e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1e+03    |
|    ep_rew_mean        | 20       |
| time/                 |          |
|    fps                | 2        |
|    iterations         | 200      |
|    time_elapsed       | 408      |
|    tot

Episode 1
![](gifs/A2C_V1/blue_apple_playground/ep_1.gif)
Episode 2
![](gifs/A2C_V1/blue_apple_playground/ep_2.gif)
Episode 3
![](gifs/A2C_V1/blue_apple_playground/ep_3.gif)
Episode 4
![](gifs/A2C_V1/blue_apple_playground/ep_4.gif)
Episode 5
![](gifs/A2C_V1/blue_apple_playground/ep_5.gif)
Episode 6
![](gifs/A2C_V1/blue_apple_playground/ep_6.gif)
Episode 7
![](gifs/A2C_V1/blue_apple_playground/ep_7.gif)
Episode 8
![](gifs/A2C_V1/blue_apple_playground/ep_8.gif)
Episode 9
![](gifs/A2C_V1/blue_apple_playground/ep_9.gif)

In [8]:
# DDPG: train -> reload -> evaluate

# Training step; the helper's result is unpacked as (model name, model directory)
model_name, model_dir = train_DDPG(env, env_name)

# Reload the saved model from "<model_dir>/<env_name>.zip" and attach the environment
model_path = Path(f"{model_dir}/{env_name}.zip")
model = DDPG.load(
    path=model_path,
    env=env,
    print_system_info=True,
)

# Evaluate the reloaded model on the environment
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\DDPG_V1_blue_apple_playground_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 414      |
|    ep_rew_mean     | 28.4     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 4        |
|    time_elapsed    | 356      |
|    total_timesteps | 1657     |
| train/             |          |
|    actor_loss      | -0.108   |
|    critic_loss     | 0.543    |
|    learning_rate   | 0.001    |
|    n_updates       | 1461     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 554      |
|    ep_rew_mean     | 23.2     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 5        |
|    time_elapsed    | 865      |
|    total_timesteps | 4436     |
| train/             |          |
|    act

Episode 1
![](gifs/DDPG_V1/blue_apple_playground/ep_1.gif)
Episode 2
![](gifs/DDPG_V1/blue_apple_playground/ep_2.gif)
Episode 3
![](gifs/DDPG_V1/blue_apple_playground/ep_3.gif)
Episode 4
![](gifs/DDPG_V1/blue_apple_playground/ep_4.gif)
Episode 5
![](gifs/DDPG_V1/blue_apple_playground/ep_5.gif)
Episode 6
![](gifs/DDPG_V1/blue_apple_playground/ep_6.gif)
Episode 7
![](gifs/DDPG_V1/blue_apple_playground/ep_7.gif)
Episode 8
![](gifs/DDPG_V1/blue_apple_playground/ep_8.gif)
Episode 9
![](gifs/DDPG_V1/blue_apple_playground/ep_9.gif)

In [9]:
# SAC: train -> reload -> evaluate

# Training step; the helper's result is unpacked as (model name, model directory)
model_name, model_dir = train_SAC(env, env_name)

# Reload the saved model from "<model_dir>/<env_name>.zip" and attach the environment
model_path = Path(f"{model_dir}/{env_name}.zip")
model = SAC.load(
    path=model_path,
    env=env,
    print_system_info=True,
)

# Evaluate the reloaded model on the environment
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\SAC_V1_blue_apple_playground_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 5        |
| time/              |          |
|    episodes        | 4        |
|    fps             | 14       |
|    time_elapsed    | 266      |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | -19.1    |
|    critic_loss     | 0.0727   |
|    ent_coef        | 0.311    |
|    ent_coef_loss   | -5.84    |
|    learning_rate   | 0.0003   |
|    n_updates       | 3899     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 2.5      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 15       |
|    time_elapsed    | 518      |
|    tota


KeyboardInterrupt



Episode 1
![](gifs/SAC_V1/blue_apple_playground/ep_1.gif)
Episode 2
![](gifs/SAC_V1/blue_apple_playground/ep_2.gif)
Episode 3
![](gifs/SAC_V1/blue_apple_playground/ep_3.gif)
Episode 4
![](gifs/SAC_V1/blue_apple_playground/ep_4.gif)
Episode 5
![](gifs/SAC_V1/blue_apple_playground/ep_5.gif)
Episode 6
![](gifs/SAC_V1/blue_apple_playground/ep_6.gif)
Episode 7
![](gifs/SAC_V1/blue_apple_playground/ep_7.gif)
Episode 8
![](gifs/SAC_V1/blue_apple_playground/ep_8.gif)
Episode 9
![](gifs/SAC_V1/blue_apple_playground/ep_9.gif)

In [None]:
# TD3: train -> reload -> evaluate

# Training step; the helper's result is unpacked as (model name, model directory)
model_name, model_dir = train_TD3(env, env_name)

# Reload the saved model from "<model_dir>/<env_name>.zip" and attach the environment
model_path = Path(f"{model_dir}/{env_name}.zip")
model = TD3.load(
    path=model_path,
    env=env,
    print_system_info=True,
)

# Evaluate the reloaded model on the environment
testing_model(env, model, model_name, env_name)

Episode 1
![](gifs/TD3_V1/blue_apple_playground/ep_1.gif)
Episode 2
![](gifs/TD3_V1/blue_apple_playground/ep_2.gif)
Episode 3
![](gifs/TD3_V1/blue_apple_playground/ep_3.gif)
Episode 4
![](gifs/TD3_V1/blue_apple_playground/ep_4.gif)
Episode 5
![](gifs/TD3_V1/blue_apple_playground/ep_5.gif)
Episode 6
![](gifs/TD3_V1/blue_apple_playground/ep_6.gif)
Episode 7
![](gifs/TD3_V1/blue_apple_playground/ep_7.gif)
Episode 8
![](gifs/TD3_V1/blue_apple_playground/ep_8.gif)
Episode 9
![](gifs/TD3_V1/blue_apple_playground/ep_9.gif)