In [None]:
# Developer convenience: enable IPython's autoreload extension so that edits to
# imported modules are picked up without restarting the kernel.
# More useful for a developer than for a third party viewing the notebook.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before executing code
%autoreload 2

# NOTE(review): re-loading the extension right after %load_ext looks redundant — confirm whether it is still needed
%reload_ext autoreload

In [1]:
# Import of custom gym.Env
from gym_env.envs.black_apple_playground import BlackAppleEnv

# Import checker to ensure environment is suitable for StableBaselines usage
from stable_baselines3.common.env_checker import check_env

# Import reinforcement learning algorithm library
from stable_baselines3 import A2C, DDPG, SAC, TD3, PPO

# Get our methods
from gym_env.envs.utils import (
    train_PPO,
    train_DDPG,
    train_SAC,
    train_TD3,
    train_A2C,
    random_actions,
    testing_model,
)

# Get our preset variables
from gym_env.envs.utils import modelsdir, logsdir, gifsdir
from gym_env.envs.utils import black_apple_playground

# Needed for creating new directories (os) and building model file paths (Path)
import os
from pathlib import Path

# Make sure every output directory (models, logs, gifs) exists before anything is written to it
for _output_dir in (modelsdir, logsdir, gifsdir):
    os.makedirs(_output_dir, exist_ok=True)

# Bind the environment name used in the cells below, then create the environment itself
# (black apples randomly invert the agent)
env_name = black_apple_playground
env = BlackAppleEnv()



In [2]:
# Testing the environment with the Stable-Baselines3 environment checker
# Code taken from: https://stable-baselines3.readthedocs.io/en/master/guide/custom_env.html
# If check_env raises no errors, our environment is suitable for usage with Stable-Baselines3
env.reset()
check_env(env)

In [3]:
# Reset the environment, then show one random sample and the shape of each space
env.reset()
action_space, observation_space = env.action_space, env.observation_space
print(f"{env_name} sample action:", action_space.sample())
print(f"{env_name} action space shape:", action_space.shape)
print(f"{env_name} sample observation:", observation_space.sample())
print(f"{env_name} observation space shape:", observation_space.shape)

black_apple_playground sample action: [-0.97395873  0.6316599  -0.50202   ]
black_apple_playground action space shape: (3,)
black_apple_playground sample observation: [0.74975455 0.06971741 0.2853258  0.1327974  0.2655109  0.3710575
 0.5507985  0.863028   0.6524585  0.9486994  0.8313275  0.1875432
 0.3272766  0.3347344  0.31540453 0.5606226  0.411577   0.14606524
 0.16441329 0.44208494 0.44310302 0.9249055  0.9064803  0.47725424
 0.47815618 0.9943988  0.5640843  0.6428367  0.08770438 0.8914619
 0.05152577 0.62003326 0.9364384  0.29500282 0.49512067 0.08504835
 0.13858671 0.2580428  0.20215996 0.25562617 0.91778773 0.60191023
 0.99700195 0.45177138 0.01351814 0.50657105 0.85072494 0.05570626
 0.3992086  0.10492894 0.91273963 0.9932732  0.37445787 0.09310506
 0.25659266 0.6683925  0.04689061 0.3521873  0.85957825 0.9542614
 0.68718064 0.77463937 0.85215205 0.29110175 0.01624168 0.35589543
 0.98287827 0.00244682 0.93710434 0.92096883 0.5228405  0.9896887
 0.33324635 0.56772196 0.24615186 

In [4]:
# Runs environment with random actions, using the project's random_actions helper
# NOTE(review): the GIF displayed under this cell (gifs/black_apple_playground_random.gif)
# is presumably written by this helper — confirm in gym_env.envs.utils
random_actions(env, env_name)

TimeStep: 1, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 2, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 3, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 4, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 5, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 6, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 7, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 8, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 9, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 10, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 11, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 12, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 13, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 14, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 15, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 16, Reward: -0.01, Done:

![](gifs/black_apple_playground_random.gif)

Training and testing various reinforcement learning models (PPO, A2C, DDPG, SAC and TD3)

In [5]:
# PPO

# Train the model; train_PPO returns a (model_name, model_dir) pair
model_name, model_dir = train_PPO(env, env_name)

# Load the trained model back from disk, expected at <model_dir>/<env_name>.zip
# print_system_info=True prints system info for the saved model and the current system
# NOTE(review): the saved output of this cell contains
# "Exception: code() argument 13 must be str, not int" — its origin is not visible here;
# possibly emitted while loading the saved model, confirm
model_path = Path(f"{model_dir}/{env_name}.zip")
model = PPO.load(path=model_path, env=env, print_system_info=True)

# Testing the model: run the loaded model in the environment via the project's testing_model helper
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\PPO_V1_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 15       |
| time/              |          |
|    fps             | 191      |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 12.5        |
| time/                   |             |
|    fps                  | 168         |
|    iterations           | 2           |
|    time_elapsed         | 24          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.006939222 |
|    clip_fraction        | 0.06        |
|    clip_range           | 0.2      

Exception: code() argument 13 must be str, not int
Exception: code() argument 13 must be str, not int


Episode 1, TimeStep: 1, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 2, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 3, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 4, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 5, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 6, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 7, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 8, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 9, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 10, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 11, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 12, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 13, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 

Episode 1
![](gifs/PPO_V1/black_apple_playground_ep_1.gif)
Episode 2
![](gifs/PPO_V1/black_apple_playground_ep_2.gif)
Episode 3
![](gifs/PPO_V1/black_apple_playground_ep_3.gif)
Episode 4
![](gifs/PPO_V1/black_apple_playground_ep_4.gif)
Episode 5
![](gifs/PPO_V1/black_apple_playground_ep_5.gif)
Episode 6
![](gifs/PPO_V1/black_apple_playground_ep_6.gif)
Episode 7
![](gifs/PPO_V1/black_apple_playground_ep_7.gif)
Episode 8
![](gifs/PPO_V1/black_apple_playground_ep_8.gif)
Episode 9
![](gifs/PPO_V1/black_apple_playground_ep_9.gif)

In [6]:
# A2C

# Train the model; train_A2C returns a (model_name, model_dir) pair
model_name, model_dir = train_A2C(env, env_name)

# Load the trained model back from disk, expected at <model_dir>/<env_name>.zip
# print_system_info=True prints system info for the saved model and the current system
model_path = Path(f"{model_dir}/{env_name}.zip")
model = A2C.load(path=model_path, env=env, print_system_info=True)

# Testing the model: run the loaded model in the environment via the project's testing_model helper
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\A2C_V1_0
------------------------------------
| time/                 |          |
|    fps                | 300      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -4.25    |
|    explained_variance | -97.7    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0572  |
|    std                | 0.998    |
|    value_loss         | 0.000507 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1e+03    |
|    ep_rew_mean        | 20       |
| time/                 |          |
|    fps                | 246      |
|    iterations         | 200      |
|    time_elapsed       | 4        |
|    total_timesteps    | 1000

Episode 1
![](gifs/A2C_V1/black_apple_playground_ep_1.gif)
Episode 2
![](gifs/A2C_V1/black_apple_playground_ep_2.gif)
Episode 3
![](gifs/A2C_V1/black_apple_playground_ep_3.gif)
Episode 4
![](gifs/A2C_V1/black_apple_playground_ep_4.gif)
Episode 5
![](gifs/A2C_V1/black_apple_playground_ep_5.gif)
Episode 6
![](gifs/A2C_V1/black_apple_playground_ep_6.gif)
Episode 7
![](gifs/A2C_V1/black_apple_playground_ep_7.gif)
Episode 8
![](gifs/A2C_V1/black_apple_playground_ep_8.gif)
Episode 9
![](gifs/A2C_V1/black_apple_playground_ep_9.gif)

In [7]:
# DDPG

# Train the model; train_DDPG returns a (model_name, model_dir) pair
model_name, model_dir = train_DDPG(env, env_name)

# Load the trained model back from disk, expected at <model_dir>/<env_name>.zip
# print_system_info=True prints system info for the saved model and the current system
model_path = Path(f"{model_dir}/{env_name}.zip")
model = DDPG.load(path=model_path, env=env, print_system_info=True)

# Testing the model: run the loaded model in the environment via the project's testing_model helper
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\DDPG_V1_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 7.5      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 60       |
|    time_elapsed    | 65       |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | 0.0958   |
|    critic_loss     | 0.15     |
|    learning_rate   | 0.001    |
|    n_updates       | 3000     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 11.2     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 48       |
|    time_elapsed    | 166      |
|    total_timesteps | 8000     |
| train/             |          |
|    actor_loss      | -0.274 

Episode 1
![](gifs/DDPG_V1/black_apple_playground_ep_1.gif)
Episode 2
![](gifs/DDPG_V1/black_apple_playground_ep_2.gif)
Episode 3
![](gifs/DDPG_V1/black_apple_playground_ep_3.gif)
Episode 4
![](gifs/DDPG_V1/black_apple_playground_ep_4.gif)
Episode 5
![](gifs/DDPG_V1/black_apple_playground_ep_5.gif)
Episode 6
![](gifs/DDPG_V1/black_apple_playground_ep_6.gif)
Episode 7
![](gifs/DDPG_V1/black_apple_playground_ep_7.gif)
Episode 8
![](gifs/DDPG_V1/black_apple_playground_ep_8.gif)
Episode 9
![](gifs/DDPG_V1/black_apple_playground_ep_9.gif)

In [8]:
# SAC

# Train the model; train_SAC returns a (model_name, model_dir) pair
model_name, model_dir = train_SAC(env, env_name)

# Load the trained model back from disk, expected at <model_dir>/<env_name>.zip
# print_system_info=True prints system info for the saved model and the current system
model_path = Path(f"{model_dir}/{env_name}.zip")
model = SAC.load(path=model_path, env=env, print_system_info=True)

# Testing the model: run the loaded model in the environment via the project's testing_model helper
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\SAC_V1_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 20       |
| time/              |          |
|    episodes        | 4        |
|    fps             | 26       |
|    time_elapsed    | 149      |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | -20.2    |
|    critic_loss     | 1.55     |
|    ent_coef        | 0.311    |
|    ent_coef_loss   | -5.78    |
|    learning_rate   | 0.0003   |
|    n_updates       | 3899     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 971      |
|    ep_rew_mean     | 19       |
| time/              |          |
|    episodes        | 8        |
|    fps             | 25       |
|    time_elapsed    | 301      |
|    total_timesteps | 7768    

Episode 1
![](gifs/SAC_V1/black_apple_playground_ep_1.gif)
Episode 2
![](gifs/SAC_V1/black_apple_playground_ep_2.gif)
Episode 3
![](gifs/SAC_V1/black_apple_playground_ep_3.gif)
Episode 4
![](gifs/SAC_V1/black_apple_playground_ep_4.gif)
Episode 5
![](gifs/SAC_V1/black_apple_playground_ep_5.gif)
Episode 6
![](gifs/SAC_V1/black_apple_playground_ep_6.gif)
Episode 7
![](gifs/SAC_V1/black_apple_playground_ep_7.gif)
Episode 8
![](gifs/SAC_V1/black_apple_playground_ep_8.gif)
Episode 9
![](gifs/SAC_V1/black_apple_playground_ep_9.gif)

In [9]:
# TD3

# Train the model; train_TD3 returns a (model_name, model_dir) pair
model_name, model_dir = train_TD3(env, env_name)

# Load the trained model back from disk, expected at <model_dir>/<env_name>.zip
# print_system_info=True prints system info for the saved model and the current system
model_path = Path(f"{model_dir}/{env_name}.zip")
model = TD3.load(path=model_path, env=env, print_system_info=True)

# Testing the model: run the loaded model in the environment via the project's testing_model helper
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\TD3_V1_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 12.5     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 49       |
|    time_elapsed    | 80       |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | -1.48    |
|    critic_loss     | 0.356    |
|    learning_rate   | 0.001    |
|    n_updates       | 3000     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 13.8     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 38       |
|    time_elapsed    | 209      |
|    total_timesteps | 8000     |
| train/             |          |
|    actor_loss      | -0.973  

Episode 1
![](gifs/TD3_V1/black_apple_playground_ep_1.gif)
Episode 2
![](gifs/TD3_V1/black_apple_playground_ep_2.gif)
Episode 3
![](gifs/TD3_V1/black_apple_playground_ep_3.gif)
Episode 4
![](gifs/TD3_V1/black_apple_playground_ep_4.gif)
Episode 5
![](gifs/TD3_V1/black_apple_playground_ep_5.gif)
Episode 6
![](gifs/TD3_V1/black_apple_playground_ep_6.gif)
Episode 7
![](gifs/TD3_V1/black_apple_playground_ep_7.gif)
Episode 8
![](gifs/TD3_V1/black_apple_playground_ep_8.gif)
Episode 9
![](gifs/TD3_V1/black_apple_playground_ep_9.gif)