In [None]:
# Allows notebook to reload, ensuring any imports are up-to-date
# More useful for developer rather than a third-party viewing notebook
%load_ext autoreload
%autoreload 2

%reload_ext autoreload

In [1]:
# Import of custom gym.Env
from gym_env.envs.red_apple_playground import RedAppleEnv

# Import checker to ensure environment is suitable for StableBaselines usage
from stable_baselines3.common.env_checker import check_env

# Import reinforcement learning algorithm library
from stable_baselines3 import A2C, DDPG, SAC, TD3, PPO

# Get our methods
from gym_env.envs.utils import (
    train_PPO,
    train_DDPG,
    train_SAC,
    train_TD3,
    train_A2C,
    random_actions,
    testing_model,
)

# Get our preset variables
from gym_env.envs.utils import modelsdir, logsdir, gifsdir
from gym_env.envs.utils import red_apple_playground

# Needed for creating new directories
import os
from pathlib import Path

# Make sure every output folder (models, logs, gifs) exists before anything uses it;
# exist_ok=True keeps this cell safe to re-run
for output_dir in (modelsdir, logsdir, gifsdir):
    os.makedirs(output_dir, exist_ok=True)

# Name used to label this environment in printed output and file paths
env_name = red_apple_playground

# Instantiate the custom environment
# Red Apples have no effect on the Agent
# NOTE(review): the logged runs below show a positive reward when the apple count
# drops, so confirm what "no effect" is meant to cover
env = RedAppleEnv()



In [2]:
# Validate the custom environment before training on it
# Approach taken from: https://stable-baselines3.readthedocs.io/en/master/guide/custom_env.html
# check_env is the Stable-Baselines3 environment checker imported at the top of the notebook;
# if this cell finishes without errors, our environment is suitable for usage
env.reset()  # reset the environment before handing it to the checker
check_env(env)

In [3]:
# Start from a freshly reset environment, then look at its two spaces
env.reset()
act_space = env.action_space
obs_space = env.observation_space

# For each space, show one randomly sampled element followed by the space's shape
print(f"{env_name} sample action:", act_space.sample())
print(f"{env_name} action space shape:", act_space.shape)
print(f"{env_name} sample observation:", obs_space.sample())
print(f"{env_name} observation space shape:", obs_space.shape)

red_apple_playground sample action: [-0.09067548 -0.99442756 -0.6426614 ]
red_apple_playground action space shape: (3,)
red_apple_playground sample observation: [0.46825406 0.18338543 0.6455542  0.14435652 0.33001363 0.8766908
 0.99069744 0.41783723 0.9933624  0.7581776  0.5027365  0.6435502
 0.8254424  0.21582314 0.21028347 0.5763731  0.64656806 0.54266524
 0.23658162 0.3732159  0.18586512 0.08812525 0.3377987  0.53645325
 0.10196034 0.9025316  0.01404372 0.55666363 0.12869155 0.8644522
 0.49279588 0.64438975 0.18788345 0.91137564 0.754948   0.37539187
 0.00656311 0.22324741 0.86807346 0.3032853  0.7977111  0.59225976
 0.6103987  0.5781567  0.45198753 0.16815087 0.92474955 0.6244981
 0.9014508  0.06761443 0.8943599  0.9508352  0.19841443 0.5350491
 0.6439179  0.6762032  0.35175842 0.44346768 0.32408923 0.4687685
 0.37215528 0.04454312 0.56144387 0.82133687 0.80694497 0.39777675
 0.20066023 0.4297626  0.39905012 0.37417755 0.26302618 0.12603015
 0.6706267  0.5860071  0.38752303 0.19424

In [4]:
# Baseline: run the environment with random actions using the random_actions
# helper imported from gym_env.envs.utils.
# The helper logs one line per timestep (reward, done flag, info message, apples left).
# NOTE(review): presumably it also writes the rollout GIF displayed under this cell — confirm
random_actions(env, env_name)

TimeStep: 1, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 2, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 3, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 4, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 5, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 6, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 7, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 8, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 9, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 10, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 11, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 12, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 13, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 14, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 15, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 16, Reward: -0.01, Done:

![](gifs/red_apple_playground_random.gif)

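## Training and testing various reinforcement learning models

Each cell below trains one Stable-Baselines3 algorithm on the environment, reloads the saved model from disk, and then runs it through the testing helper.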

In [5]:
# PPO

# Fit a PPO agent through the project helper, which hands back the model's
# name and the directory used for this model
model_name, model_dir = train_PPO(env, env_name)

# Read the agent back from "<model_dir>/<env_name>.zip" and attach the environment;
# print_system_info=True asks the loader to print system information
model_path = Path(f"{model_dir}/{env_name}.zip")
model = PPO.load(model_path, env=env, print_system_info=True)

# Run the reloaded agent through the testing helper
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\PPO_V1_red_apple_playground_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 10       |
| time/              |          |
|    fps             | 161      |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 10          |
| time/                   |             |
|    fps                  | 155         |
|    iterations           | 2           |
|    time_elapsed         | 26          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005489124 |
|    clip_fraction        | 0.0582      |
|    clip_range 

Exception: code() argument 13 must be str, not int
Exception: code() argument 13 must be str, not int


Episode 1, TimeStep: 1, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 2, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 3, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 4, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 5, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 6, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 7, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 8, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 9, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 10, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 11, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 12, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 13, Reward: 9.99, Done: False, Message: {}, Apples: 3
Episode 1, TimeStep: 1

Episode 1
![](gifs/PPO_V1/red_apple_playground_ep_1.gif)
Episode 2
![](gifs/PPO_V1/red_apple_playground_ep_2.gif)
Episode 3
![](gifs/PPO_V1/red_apple_playground_ep_3.gif)
Episode 4
![](gifs/PPO_V1/red_apple_playground_ep_4.gif)
Episode 5
![](gifs/PPO_V1/red_apple_playground_ep_5.gif)
Episode 6
![](gifs/PPO_V1/red_apple_playground_ep_6.gif)
Episode 7
![](gifs/PPO_V1/red_apple_playground_ep_7.gif)
Episode 8
![](gifs/PPO_V1/red_apple_playground_ep_8.gif)
Episode 9
![](gifs/PPO_V1/red_apple_playground_ep_9.gif)

In [6]:
# A2C

# Fit an A2C agent through the project helper, which hands back the model's
# name and the directory used for this model
model_name, model_dir = train_A2C(env, env_name)

# Read the agent back from "<model_dir>/<env_name>.zip" and attach the environment;
# print_system_info=True asks the loader to print system information
model_path = Path(f"{model_dir}/{env_name}.zip")
model = A2C.load(model_path, env=env, print_system_info=True)

# Run the reloaded agent through the testing helper
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\A2C_V1_red_apple_playground_0
------------------------------------
| time/                 |          |
|    fps                | 235      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -4.26    |
|    explained_variance | -286     |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.264    |
|    std                | 1        |
|    value_loss         | 0.00325  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1e+03    |
|    ep_rew_mean        | 20       |
| time/                 |          |
|    fps                | 200      |
|    iterations         | 200      |
|    time_elapsed       | 4        |
|    tota

Episode 1
![](gifs/A2C_V1/red_apple_playground_ep_1.gif)
Episode 2
![](gifs/A2C_V1/red_apple_playground_ep_2.gif)
Episode 3
![](gifs/A2C_V1/red_apple_playground_ep_3.gif)
Episode 4
![](gifs/A2C_V1/red_apple_playground_ep_4.gif)
Episode 5
![](gifs/A2C_V1/red_apple_playground_ep_5.gif)
Episode 6
![](gifs/A2C_V1/red_apple_playground_ep_6.gif)
Episode 7
![](gifs/A2C_V1/red_apple_playground_ep_7.gif)
Episode 8
![](gifs/A2C_V1/red_apple_playground_ep_8.gif)
Episode 9
![](gifs/A2C_V1/red_apple_playground_ep_9.gif)

In [7]:
# DDPG

# Fit a DDPG agent through the project helper, which hands back the model's
# name and the directory used for this model
model_name, model_dir = train_DDPG(env, env_name)

# Read the agent back from "<model_dir>/<env_name>.zip" and attach the environment;
# print_system_info=True asks the loader to print system information
model_path = Path(f"{model_dir}/{env_name}.zip")
model = DDPG.load(model_path, env=env, print_system_info=True)

# Run the reloaded agent through the testing helper
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\DDPG_V1_red_apple_playground_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 338      |
|    ep_rew_mean     | 31.6     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 35       |
|    time_elapsed    | 38       |
|    total_timesteps | 1353     |
| train/             |          |
|    actor_loss      | -0.523   |
|    critic_loss     | 0.717    |
|    learning_rate   | 0.001    |
|    n_updates       | 1239     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 228      |
|    ep_rew_mean     | 35.2     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 32       |
|    time_elapsed    | 56       |
|    total_timesteps | 1825     |
| train/             |          |
|    acto

Episode 1
![](gifs/DDPG_V1/red_apple_playground_ep_1.gif)
Episode 2
![](gifs/DDPG_V1/red_apple_playground_ep_2.gif)
Episode 3
![](gifs/DDPG_V1/red_apple_playground_ep_3.gif)
Episode 4
![](gifs/DDPG_V1/red_apple_playground_ep_4.gif)
Episode 5
![](gifs/DDPG_V1/red_apple_playground_ep_5.gif)
Episode 6
![](gifs/DDPG_V1/red_apple_playground_ep_6.gif)
Episode 7
![](gifs/DDPG_V1/red_apple_playground_ep_7.gif)
Episode 8
![](gifs/DDPG_V1/red_apple_playground_ep_8.gif)
Episode 9
![](gifs/DDPG_V1/red_apple_playground_ep_9.gif)

In [10]:
# SAC

# Fit a SAC agent through the project helper, which hands back the model's
# name and the directory used for this model
model_name, model_dir = train_SAC(env, env_name)

# Read the agent back from "<model_dir>/<env_name>.zip" and attach the environment;
# print_system_info=True asks the loader to print system information
model_path = Path(f"{model_dir}/{env_name}.zip")
model = SAC.load(model_path, env=env, print_system_info=True)

# Run the reloaded agent through the testing helper
testing_model(env, model, model_name, env_name)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 12.5     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 5        |
|    time_elapsed    | 771      |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | -19.2    |
|    critic_loss     | 0.182    |
|    ent_coef        | 0.311    |
|    ent_coef_loss   | -5.77    |
|    learning_rate   | 0.0003   |
|    n_updates       | 3899     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 11.2     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 7        |
|    time_elapsed    | 1094     |
|    total_timesteps | 8000     |
| train/             |          |
|    actor_loss      | -20.4    |
|    critic_loss     | 0.153    |
|    ent_coef 


KeyboardInterrupt



Episode 1
![](gifs/SAC_V1/red_apple_playground_ep_1.gif)
Episode 2
![](gifs/SAC_V1/red_apple_playground_ep_2.gif)
Episode 3
![](gifs/SAC_V1/red_apple_playground_ep_3.gif)
Episode 4
![](gifs/SAC_V1/red_apple_playground_ep_4.gif)
Episode 5
![](gifs/SAC_V1/red_apple_playground_ep_5.gif)
Episode 6
![](gifs/SAC_V1/red_apple_playground_ep_6.gif)
Episode 7
![](gifs/SAC_V1/red_apple_playground_ep_7.gif)
Episode 8
![](gifs/SAC_V1/red_apple_playground_ep_8.gif)
Episode 9
![](gifs/SAC_V1/red_apple_playground_ep_9.gif)

In [None]:
# TD3

# Fit a TD3 agent through the project helper, which hands back the model's
# name and the directory used for this model
model_name, model_dir = train_TD3(env, env_name)

# Read the agent back from "<model_dir>/<env_name>.zip" and attach the environment;
# print_system_info=True asks the loader to print system information
model_path = Path(f"{model_dir}/{env_name}.zip")
model = TD3.load(model_path, env=env, print_system_info=True)

# Run the reloaded agent through the testing helper
testing_model(env, model, model_name, env_name)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 155      |
|    ep_rew_mean     | 37.6     |
| time/              |          |
|    episodes        | 48       |
|    fps             | 20       |
|    time_elapsed    | 366      |
|    total_timesteps | 7435     |
| train/             |          |
|    actor_loss      | -1.25    |
|    critic_loss     | 0.912    |
|    learning_rate   | 0.001    |
|    n_updates       | 7317     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 152      |
|    ep_rew_mean     | 37.7     |
| time/              |          |
|    episodes        | 52       |
|    fps             | 20       |
|    time_elapsed    | 390      |
|    total_timesteps | 7907     |
| train/             |          |
|    actor_loss      | -1.37    |
|    critic_loss     | 0.858    |
|    learning_rate   | 0.001    |
|    n_updates       | 7789     |
--------------

Episode 1
![](gifs/TD3_V1/red_apple_playground_ep_1.gif)
Episode 2
![](gifs/TD3_V1/red_apple_playground_ep_2.gif)
Episode 3
![](gifs/TD3_V1/red_apple_playground_ep_3.gif)
Episode 4
![](gifs/TD3_V1/red_apple_playground_ep_4.gif)
Episode 5
![](gifs/TD3_V1/red_apple_playground_ep_5.gif)
Episode 6
![](gifs/TD3_V1/red_apple_playground_ep_6.gif)
Episode 7
![](gifs/TD3_V1/red_apple_playground_ep_7.gif)
Episode 8
![](gifs/TD3_V1/red_apple_playground_ep_8.gif)
Episode 9
![](gifs/TD3_V1/red_apple_playground_ep_9.gif)