In [None]:
# Allows notebook to reload, ensuring any imports are up-to-date
# More useful for developer rather than a third-party viewing notebook
%load_ext autoreload
%autoreload 2

%reload_ext autoreload

In [1]:
# Import of custom gym.Env
from gym_env.envs.green_apple_playground import GreenAppleEnv

# Import checker to ensure environment is suitable for StableBaselines usage
from stable_baselines3.common.env_checker import check_env

# Import reinforcement learning algorithm library
from stable_baselines3 import A2C, DDPG, SAC, TD3, PPO

# Get our methods
from gym_env.envs.utils import (
    train_PPO,
    train_DDPG,
    train_SAC,
    train_TD3,
    train_A2C,
    random_actions,
    testing_model,
)

# Get our preset variables
from gym_env.envs.utils import modelsdir, logsdir, gifsdir
from gym_env.envs.utils import green_apple_playground

# Needed for creating new directories
import os
from pathlib import Path

# Create the models, logs and gifs directories (no error if they already exist)
for output_dir in (modelsdir, logsdir, gifsdir):
    os.makedirs(output_dir, exist_ok=True)

# Instantiate the custom environment
# Green Apples invert the y axis
env = GreenAppleEnv()

# Name used below in printed labels and when building model file paths
env_name = green_apple_playground



In [2]:
# Testing the environment
# Code taken from: https://stable-baselines3.readthedocs.io/en/master/guide/custom_env.html
# check_env is the Stable-Baselines3 environment checker imported at the top of
# the notebook.
# If no errors occur, our environment is suitable for usage
# NOTE(review): the explicit reset before check_env may be redundant if the
# checker resets the environment itself - confirm against the SB3 docs.
env.reset()
check_env(env)

In [3]:
# Start from a fresh episode, then show one random sample and the shape of
# both the action space and the observation space.
env.reset()

space_summary = (
    ("sample action", env.action_space.sample()),
    ("action space shape", env.action_space.shape),
    ("sample observation", env.observation_space.sample()),
    ("observation space shape", env.observation_space.shape),
)
for description, value in space_summary:
    print(f"{env_name} {description}:", value)

green_apple_playground sample action: [ 0.7969086  -0.15171923 -0.0809028 ]
green_apple_playground action space shape: (3,)
green_apple_playground sample observation: [0.8467855  0.99010766 0.06401986 0.9314538  0.49777302 0.5587956
 0.61117023 0.36120635 0.39683416 0.02156536 0.24713382 0.03789672
 0.34801918 0.02120407 0.2303817  0.09931921 0.35675505 0.04152986
 0.0686816  0.4312153  0.10031384 0.2732049  0.22304106 0.29729417
 0.5991394  0.9078755  0.7841645  0.84801036 0.67610955 0.3069813
 0.874093   0.02643669 0.8651261  0.91771495 0.8908042  0.2372292
 0.7629882  0.02128404 0.67636955 0.29807216 0.7547855  0.05507567
 0.4737117  0.54431415 0.510617   0.53165495 0.41365087 0.2679112
 0.76350653 0.14611872 0.8378906  0.8812243  0.5330817  0.43403167
 0.4640271  0.6069836  0.20593488 0.4110444  0.6855476  0.7937395
 0.5603176  0.92220515 0.1018492  0.6156511  0.375288   0.61153626
 0.3166644  0.16607358 0.72060657 0.19242594 0.3505813  0.37148178
 0.74538016 0.13571954 0.5380574  

In [4]:
# Runs environment with random actions
# random_actions is a project helper imported from gym_env.envs.utils; it
# receives the environment and its name.
# NOTE(review): presumably this also writes the random-run gif shown below
# (gifs/<env_name>_random.gif) - confirm in gym_env.envs.utils.
random_actions(env, env_name)

TimeStep: 1, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 2, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 3, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 4, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 5, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 6, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 7, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 8, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 9, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 10, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 11, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 12, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 13, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 14, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 15, Reward: -0.01, Done: False, Message: {}, Apples: 4
TimeStep: 16, Reward: -0.01, Done:

![green_apple_playground_random.gif](gifs/green_apple_playground_random.gif)

Testing and training various reinforcement learning models

In [5]:
# PPO: train, reload the saved model, then evaluate it

# The training helper returns the model's name and the directory used to
# locate the saved model below
model_name, model_dir = train_PPO(env, env_name)

# The saved model is expected at <model_dir>/<env_name>.zip; build the path
# from segments instead of hard-coding the separator
model_path = Path(model_dir, f"{env_name}.zip")
model = PPO.load(path=model_path, env=env, print_system_info=True)

# Run the loaded model in the environment
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\PPO_V1_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 10       |
| time/              |          |
|    fps             | 265      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1e+03        |
|    ep_rew_mean          | 10           |
| time/                   |              |
|    fps                  | 248          |
|    iterations           | 2            |
|    time_elapsed         | 16           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0098722335 |
|    clip_fraction        | 0.112        |
|    clip_range          

Exception: code() argument 13 must be str, not int
Exception: code() argument 13 must be str, not int


Episode 1, TimeStep: 48, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 49, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 50, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 51, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 52, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 53, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 54, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 55, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 56, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 57, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 58, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 59, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, TimeStep: 60, Reward: -0.01, Done: False, Message: {}, Apples: 4
Episode 1, T

Episode 1
![](gifs/PPO_V1/green_apple_playground_ep_1.gif)
Episode 2
![](gifs/PPO_V1/green_apple_playground_ep_2.gif)
Episode 3
![](gifs/PPO_V1/green_apple_playground_ep_3.gif)
Episode 4
![](gifs/PPO_V1/green_apple_playground_ep_4.gif)
Episode 5
![](gifs/PPO_V1/green_apple_playground_ep_5.gif)
Episode 6
![](gifs/PPO_V1/green_apple_playground_ep_6.gif)
Episode 7
![](gifs/PPO_V1/green_apple_playground_ep_7.gif)
Episode 8
![](gifs/PPO_V1/green_apple_playground_ep_8.gif)
Episode 9
![](gifs/PPO_V1/green_apple_playground_ep_9.gif)

In [6]:
# A2C: train, reload the saved model, then evaluate it

# The training helper returns the model's name and the directory used to
# locate the saved model below
model_name, model_dir = train_A2C(env, env_name)

# The saved model is expected at <model_dir>/<env_name>.zip; build the path
# from segments instead of hard-coding the separator
model_path = Path(model_dir, f"{env_name}.zip")
model = A2C.load(path=model_path, env=env, print_system_info=True)

# Run the loaded model in the environment
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\A2C_V1_0
------------------------------------
| time/                 |          |
|    fps                | 273      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -4.26    |
|    explained_variance | -10.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.00365 |
|    std                | 1        |
|    value_loss         | 2.94e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1e+03    |
|    ep_rew_mean        | 10       |
| time/                 |          |
|    fps                | 246      |
|    iterations         | 200      |
|    time_elapsed       | 4        |
|    total_timesteps    | 1000

Episode 1
![](gifs/A2C_V1/green_apple_playground_ep_1.gif)
Episode 2
![](gifs/A2C_V1/green_apple_playground_ep_2.gif)
Episode 3
![](gifs/A2C_V1/green_apple_playground_ep_3.gif)
Episode 4
![](gifs/A2C_V1/green_apple_playground_ep_4.gif)
Episode 5
![](gifs/A2C_V1/green_apple_playground_ep_5.gif)
Episode 6
![](gifs/A2C_V1/green_apple_playground_ep_6.gif)
Episode 7
![](gifs/A2C_V1/green_apple_playground_ep_7.gif)
Episode 8
![](gifs/A2C_V1/green_apple_playground_ep_8.gif)
Episode 9
![](gifs/A2C_V1/green_apple_playground_ep_9.gif)

In [7]:
# DDPG: train, reload the saved model, then evaluate it

# The training helper returns the model's name and the directory used to
# locate the saved model below
model_name, model_dir = train_DDPG(env, env_name)

# The saved model is expected at <model_dir>/<env_name>.zip; build the path
# from segments instead of hard-coding the separator
model_path = Path(model_dir, f"{env_name}.zip")
model = DDPG.load(path=model_path, env=env, print_system_info=True)

# Run the loaded model in the environment
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\DDPG_V1_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 5        |
| time/              |          |
|    episodes        | 4        |
|    fps             | 68       |
|    time_elapsed    | 58       |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | -0.101   |
|    critic_loss     | 0.0571   |
|    learning_rate   | 0.001    |
|    n_updates       | 3000     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 7.5      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 59       |
|    time_elapsed    | 135      |
|    total_timesteps | 8000     |
| train/             |          |
|    actor_loss      | -1.83  

Episode 1
![](gifs/DDPG_V1/green_apple_playground_ep_1.gif)
Episode 2
![](gifs/DDPG_V1/green_apple_playground_ep_2.gif)
Episode 3
![](gifs/DDPG_V1/green_apple_playground_ep_3.gif)
Episode 4
![](gifs/DDPG_V1/green_apple_playground_ep_4.gif)
Episode 5
![](gifs/DDPG_V1/green_apple_playground_ep_5.gif)
Episode 6
![](gifs/DDPG_V1/green_apple_playground_ep_6.gif)
Episode 7
![](gifs/DDPG_V1/green_apple_playground_ep_7.gif)
Episode 8
![](gifs/DDPG_V1/green_apple_playground_ep_8.gif)
Episode 9
![](gifs/DDPG_V1/green_apple_playground_ep_9.gif)

In [8]:
# SAC: train, reload the saved model, then evaluate it

# The training helper returns the model's name and the directory used to
# locate the saved model below
model_name, model_dir = train_SAC(env, env_name)

# The saved model is expected at <model_dir>/<env_name>.zip; build the path
# from segments instead of hard-coding the separator
model_path = Path(model_dir, f"{env_name}.zip")
model = SAC.load(path=model_path, env=env, print_system_info=True)

# Run the loaded model in the environment
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\SAC_V1_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 7.5      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 28       |
|    time_elapsed    | 138      |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | -19.1    |
|    critic_loss     | 0.107    |
|    ent_coef        | 0.311    |
|    ent_coef_loss   | -5.82    |
|    learning_rate   | 0.0003   |
|    n_updates       | 3899     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 6.25     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 28       |
|    time_elapsed    | 280      |
|    total_timesteps | 8000    

Episode 1
![](gifs/SAC_V1/green_apple_playground_ep_1.gif)
Episode 2
![](gifs/SAC_V1/green_apple_playground_ep_2.gif)
Episode 3
![](gifs/SAC_V1/green_apple_playground_ep_3.gif)
Episode 4
![](gifs/SAC_V1/green_apple_playground_ep_4.gif)
Episode 5
![](gifs/SAC_V1/green_apple_playground_ep_5.gif)
Episode 6
![](gifs/SAC_V1/green_apple_playground_ep_6.gif)
Episode 7
![](gifs/SAC_V1/green_apple_playground_ep_7.gif)
Episode 8
![](gifs/SAC_V1/green_apple_playground_ep_8.gif)
Episode 9
![](gifs/SAC_V1/green_apple_playground_ep_9.gif)

In [9]:
# TD3: train, reload the saved model, then evaluate it

# The training helper returns the model's name and the directory used to
# locate the saved model below
model_name, model_dir = train_TD3(env, env_name)

# The saved model is expected at <model_dir>/<env_name>.zip; build the path
# from segments instead of hard-coding the separator
model_path = Path(model_dir, f"{env_name}.zip")
model = TD3.load(path=model_path, env=env, print_system_info=True)

# Run the loaded model in the environment
testing_model(env, model, model_name, env_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\TD3_V1_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 10       |
| time/              |          |
|    episodes        | 4        |
|    fps             | 64       |
|    time_elapsed    | 61       |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | -0.148   |
|    critic_loss     | 0.48     |
|    learning_rate   | 0.001    |
|    n_updates       | 3000     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 10       |
| time/              |          |
|    episodes        | 8        |
|    fps             | 56       |
|    time_elapsed    | 141      |
|    total_timesteps | 8000     |
| train/             |          |
|    actor_loss      | -0.0685 

Episode 1
![](gifs/TD3_V1/green_apple_playground_ep_1.gif)
Episode 2
![](gifs/TD3_V1/green_apple_playground_ep_2.gif)
Episode 3
![](gifs/TD3_V1/green_apple_playground_ep_3.gif)
Episode 4
![](gifs/TD3_V1/green_apple_playground_ep_4.gif)
Episode 5
![](gifs/TD3_V1/green_apple_playground_ep_5.gif)
Episode 6
![](gifs/TD3_V1/green_apple_playground_ep_6.gif)
Episode 7
![](gifs/TD3_V1/green_apple_playground_ep_7.gif)
Episode 8
![](gifs/TD3_V1/green_apple_playground_ep_8.gif)
Episode 9
![](gifs/TD3_V1/green_apple_playground_ep_9.gif)