# 1. Dependencies

In [1]:
!pip install stable-baselines3[extra]


Collecting stable-baselines3[extra]
  Downloading stable_baselines3-1.1.0-py3-none-any.whl (172 kB)
[K     |████████████████████████████████| 172 kB 861 kB/s eta 0:00:01
[?25hCollecting matplotlib
  Downloading matplotlib-3.4.2-cp38-cp38-manylinux1_x86_64.whl (10.3 MB)
[K     |████████████████████████████████| 10.3 MB 1.7 MB/s eta 0:00:01    |███████████▊                    | 3.8 MB 698 kB/s eta 0:00:10
[?25hCollecting cloudpickle
  Downloading cloudpickle-1.6.0-py3-none-any.whl (23 kB)
Collecting torch>=1.4.0
  Downloading torch-1.9.0-cp38-cp38-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 36 kB/s  eta 0:00:01     |█████████▍                      | 243.1 MB 324 kB/s eta 0:30:11
[?25hCollecting gym>=0.17
  Downloading gym-0.18.3.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 1.3 MB/s eta 0:00:01
Collecting pandas
  Downloading pandas-1.3.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (10.6 MB)
[K   

In [2]:
import os
import gym # OpenAI gym
from stable_baselines3 import PPO # on fait le choix de prendre cet algorithme parmi (A2C,ACER,DQN,....)
from stable_baselines3.common.vec_env import DummyVecEnv # huge boost (use in the breakdown game)
from stable_baselines3.common.evaluation import evaluate_policy # easier to test our model

# 2. Load Environment

In [3]:
# Id of the Gym environment used throughout the notebook.
environnement_name = 'CartPole-v0'
# Instantiate the (unwrapped, single) environment for the exploration cells below.
env = gym.make(environnement_name)

In [28]:
# Baseline: play the CartPole environment with purely random actions.
#
# An episode is one full game within the environment.
# Some environments have a fixed maximum episode length
#     (CartPole: 200 frames); others are open-ended
#     (Breakout: play until you run out of lives).
episodes = 5  # number of episodes to play

# try/finally guarantees the render window is closed even if the loop is
# interrupted or raises part-way through an episode.
try:
    for episode in range(1, episodes + 1):
        state = env.reset()  # initial observation of the new episode
        done = False         # becomes True when the episode is over
        score = 0            # running total of the rewards in this episode

        while not done:
            # show the graphical representation of the environment
            env.render()
            # pick a random action from the action space
            action = env.action_space.sample()
            # apply the action:
            #   n_state = next observation
            #   reward  = reward earned by this step
            #   done    = True once the episode has ended (exits the while loop)
            #   info    = dict with extra information
            n_state, reward, done, info = env.step(action)
            # accumulate the total reward of the episode
            score += reward
        print('Episode: {} Score: {}'.format(episode, score))
finally:
    env.close()

Episode: 1 Score: 19.0
Episode: 2 Score: 31.0
Episode: 3 Score: 10.0
Episode: 4 Score: 34.0
Episode: 5 Score: 67.0


In [24]:
# Close the render window (useful if it was left open by an interrupted run).
env.close()

In [21]:
# reset() starts a new episode and returns its initial observation (4 values).
print(env.reset())

[ 0.04144566 -0.03622807  0.01082143 -0.00254106]


In [19]:
# Display the action space and then the observation space, one per line.
#   action space      -> Discrete(2): the only possible actions are 0 and 1
#   observation space -> Box(low, high, shape, dtype); shape (4,) means every
#                        observation holds four values, like the array
#                        returned by env.reset()
print(env.action_space, env.observation_space, sep='\n')

Discrete(2)
Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)


In [22]:
# Apply action 1 (push the cart to the right) once.
# The result is the tuple (observation, reward, done, info).
env.step(1)
# The length of an episode is the number of steps taken before it is done.

(array([ 0.0407211 ,  0.15873703,  0.0107706 , -0.29179014]), 1.0, False, {})

### Env.Functions
The main environment functions are:
* env.reset() - reset the environment and obtain initial observations
* env.render() - visualise the environment
* env.step() - apply an action to the environment
* env.close() - close down the render frame


### Understanding The Environment

https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

In [32]:
# The set of actions the agent may take.
env.action_space

Discrete(2)

In [31]:
# Draw one random action from the action space:
# 0 -> push cart to the left,
# 1 -> push cart to the right
env.action_space.sample()

0

In [33]:
# The bounds, shape and dtype of the observations the environment produces.
env.observation_space

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)

In [30]:
# Draw a random point from the observation space (not a real game state).
# Layout: [cart position, cart velocity, pole angle, pole angular velocity]
env.observation_space.sample()

array([ 3.8455968e+00,  2.7743742e+38, -1.9475874e-01,  2.5730391e+38],
      dtype=float32)

# 3. Training

Stable Baselines focuses on
    Model-free RL
        Policy optimization: PPO, Policy Gradient, A2C...
        Q-learning: DQN
Stable Baselines does not focus on
    Model-based RL, which tries to predict the future state
        AlphaZero

A number of algorithms are available through Stable Baselines.
Each algorithm performs better in certain environments, depending on the action space.

The existing action space types are:
    Box
    Discrete
    MultiDiscrete
    MultiBinary
    Multi Processing (strictly speaking not an action space: it indicates whether an algorithm can train on several environments in parallel)

In [34]:
# TensorBoard logs go to Training/Logs (saved models go to Training/Saved Models).
# The log directory is created here, so no manual setup is needed before the
# first run; exist_ok=True makes the cell safe to re-run.
log_path = os.path.join('Training', 'Logs')
os.makedirs(log_path, exist_ok=True)


In [35]:
# Display the resolved log directory.
log_path

'Training/Logs'

In [38]:
# Wrap the environment in a DummyVecEnv, the vectorized interface the model is
# trained on. The factory builds the environment itself instead of closing
# over `env`: that name is rebound to the wrapper by this very statement, so a
# closure over it only works while the factory happens to be called immediately.
env = DummyVecEnv([lambda: gym.make(environnement_name)])

# Think of the agent's policy as the rule that tells it how to act in the
# environment. Stable-Baselines3 has three policy types: MlpPolicy, CnnPolicy
# and MultiInputPolicy. Training metrics are written under log_path.
model = PPO(policy="MlpPolicy", env=env, verbose=1, tensorboard_log=log_path)


Using cpu device


In [40]:
# Train the agent for 20,000 environment steps; progress tables are printed
# (verbose=1) and the run is logged under log_path.
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_2
-----------------------------
| time/              |      |
|    fps             | 2686 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1890        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009471068 |
|    clip_fraction        | 0.0908      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.567      |
|    explained_variance   | 0.694       |
|    learning_rate        | 0.0003      |
|    loss                 | 22.8        |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.00969    |
|    value_loss           | 72.7        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x7f00f22a4d90>

# 4. Save and Reload Model

In [41]:
# Location where the trained PPO agent is saved.
PPO_Path = os.path.join('Training','Saved Models','PPO_Model_Cartpole')

In [42]:
# Persist the trained agent to disk.
model.save(PPO_Path)

In [44]:
# Display the save location.
PPO_Path

'Training/Saved Models/PPO_Model_Cartpole'

In [45]:
# Drop the in-memory agent to demonstrate that it can be restored from disk.
del model

In [48]:
model.learn(total_timesteps=1000)

Logging to Training/Logs/PPO_3
-----------------------------
| time/              |      |
|    fps             | 2666 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------


<stable_baselines3.ppo.ppo.PPO at 0x7f00865408b0>

In [47]:
model=PPO.load(PPO_Path,env=env)

# 5. Testing and Evaluation

Evaluation metrics

There are a number of metrics available from the models when trained, but there are two core values that you should pay attention to:
rollout/*ep_len_mean*
    on average, how long an episode lasted before it was done
rollout/*ep_rew_mean*
    the average reward that the agent accumulated per episode

These were available with the A2C algorithm, but
be careful: in the PPO training logs above we don't have these metrics (the logs only contain the time/ and train/ sections), so
    we evaluate the model itself
    we evaluate it with evaluate_policy

Monitoring in TensorBoard
We are able to review evaluation, time and training metrics from within TensorBoard.
In order to do so, we must specify a logging directory when we initialise the model.

The PPO model in this case is considered solved if we get an average score of 200 or higher (200 is the maximum score for CartPole-v0),
but there is not always this kind of threshold.

In [50]:
 # Evaluation : we use our import located in top of the nb
# Run the agent for 10 episodes while rendering; the result is the tuple
# (mean reward per episode, standard deviation of the reward).
evaluate_policy(model=model,env=env,n_eval_episodes=10,render=True)




(200.0, 0.0)

In [51]:
# Close the render window opened by the evaluation above.
env.close()


Good to know: CartPole reward
The reward for CartPole is 1 point for every step that the pole remains upright (with a maximum of 200 steps).
The maximum is 200 because a CartPole-v0 episode is capped at 200 steps.
If the pole is more than 12 degrees from vertical or the cart moves more than 2.4 units from the center, the episode ends
-> there is a margin of error

# 6. Test Model / Test the agent


In [58]:
# Play the CartPole environment with the trained agent choosing the actions.
#
# An episode is one full game within the environment.
# Some environments have a fixed maximum episode length
#     (CartPole: 200 frames); others are open-ended
#     (Breakout: play until you run out of lives).
#
# `env` is the vectorized environment here, so obs, reward and done are arrays
# with one entry per wrapped environment; that is why the score is printed as
# a one-element array such as [200.].
episodes = 5  # number of episodes to play

# try/finally guarantees the render window is closed even if the loop is
# interrupted or raises part-way through an episode.
try:
    for episode in range(1, episodes + 1):
        obs = env.reset()  # initial observation of the new episode
        done = False       # becomes truthy when the episode is over
        score = 0          # running total of the rewards in this episode

        while not done:
            # show the graphical representation of the environment
            env.render()
            # let the agent choose the action for the current observation
            action, _ = model.predict(obs)
            # apply the action:
            #   obs    = next observation
            #   reward = reward earned by this step
            #   done   = truthy once the episode has ended (exits the while loop)
            #   info   = extra information
            obs, reward, done, info = env.step(action)
            # accumulate the total reward of the episode
            score += reward
        print('Episode: {} Score: {}'.format(episode, score))
finally:
    env.close()

Episode: 1 Score: [200.]
Episode: 2 Score: [200.]
Episode: 3 Score: [200.]
Episode: 4 Score: [200.]
Episode: 5 Score: [200.]


In [57]:
# Close the render window (useful if it was left open by an interrupted run).
env.close()


In [59]:
# Start a new episode and keep its initial observation for the cells below.
obs = env.reset()

In [60]:
# Display the observation: a batch with one row of four values.
obs

array([[-0.04191476, -0.04096119,  0.02818076, -0.0303325 ]],
      dtype=float32)

In [62]:
# predict() returns a pair; the first element is the action chosen by the model.
# The second element is only relevant for recurrent policies, which are not
# used here, so it is discarded with "_".
action, _ = model.predict(obs)

In [67]:
# First line: the model's prediction; an array such as [0] means the model
# predicted action 0.
# Second line: a random action drawn from the action space, for comparison.
print(action, env.action_space.sample(), sep='\n')

[0]
0


In [68]:
# allow us to get the reward
env.step(action)
# array[1,] is a reward of 1

(array([[-0.04273398, -0.2364757 ,  0.02757411,  0.27110687]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{}])

# 7. Viewing Logs in Tensorboard

In [71]:
# 'PPO_2' is the run directory reported by the first training call
# ("Logging to Training/Logs/PPO_2"); adjust it if your run has another number.
training_log_path = os.path.join(log_path,'PPO_2')


In [70]:
# Serve TensorBoard for the selected run (http://localhost:6006/ in the output).
!tensorboard --logdir={training_log_path}
# The command keeps running: interrupt the cell to stop it and continue.

2021-07-14 19:05:09.411248: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-14 19:05:09.411270: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.5.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


Core metrics to look at :
1. Average Reward
2. Average Episode length

Training Strategies :
1. Train for longer
2. Hyperparameter tuning (useful package: Optuna)
3. Try Different algorithms

# 8. Adding a callback to the training Stage

In [73]:
# Stop training early, before the agent becomes unstable; this is also useful to reduce the training time


In [74]:
from stable_baselines3.common.callbacks import EvalCallback,StopTrainingOnRewardThreshold


In [75]:
# Stops training once the evaluated mean reward reaches 200; it is triggered
# through the EvalCallback's callback_on_new_best.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200,verbose=1)


In [77]:
# Directory where the evaluation callback saves the best model found so far.
save_path = os.path.join('Training','Saved Models')

In [78]:
# Evaluate on a dedicated environment rather than on the training `env`:
# periodic evaluation resets and steps the environment it is given, which
# would otherwise interfere with the rollout being collected for training.
eval_env = DummyVecEnv([lambda: gym.make(environnement_name)])

# Every 10,000 steps the agent is evaluated; on a new best mean reward the
# model is saved to save_path and stop_callback decides whether to end training.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [79]:
# Fresh, untrained PPO agent used to demonstrate training with the callback.
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)


Using cpu device


In [80]:
# Train for up to 20,000 steps; eval_callback may end the run earlier once the
# reward threshold is reached.
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training/Logs/PPO_4
-----------------------------
| time/              |      |
|    fps             | 2624 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 1875       |
|    iterations           | 2          |
|    time_elapsed         | 2          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00939241 |
|    clip_fraction        | 0.0867     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.687     |
|    explained_variance   | 0.00546    |
|    learning_rate        | 0.0003     |
|    loss                 | 7.47       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0121    |
|    value_loss           | 54.5       |
----------------------------------------
---------------------



<stable_baselines3.ppo.ppo.PPO at 0x7f00865437c0>

# 9. Changing Policies

In [82]:
# Custom network layout: four hidden layers of 128 units each for the
# policy network ("pi", the actor) and the same for the value function ("vf").
net_arch = [{'pi': [128] * 4, 'vf': [128] * 4}]

Network architecture
    This is an example of specifying a different architecture for the different neural networks used in PPO
You can also simplify this and use:
    net_arch = [128,128] to use the same for both

In [83]:
# New PPO agent whose networks use the custom layout defined in net_arch.
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path,policy_kwargs={'net_arch':net_arch})



Using cpu device


In [84]:
# Train the custom-architecture agent, reusing the same evaluation callback.
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training/Logs/PPO_5
-----------------------------
| time/              |      |
|    fps             | 1919 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1347        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014819702 |
|    clip_fraction        | 0.211       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.0103     |
|    learning_rate        | 0.0003      |
|    loss                 | 2.98        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0227     |
|    value_loss           | 22.1        |
-----------------------------------------
---



<stable_baselines3.ppo.ppo.PPO at 0x7f0085d17bb0>

# 10. Using an Alternative Algorithm

In [85]:
from stable_baselines3 import DQN

Using cpu device


In [86]:
# Same environment and policy type, but trained with DQN instead of PPO.
model = DQN('MlpPolicy', env, verbose=1,tensorboard_log=log_path)


Using cpu device


In [87]:
# Train the DQN agent for 20,000 environment steps (no callback this time).
model.learn(total_timesteps=20000)

Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    exploration rate | 0.967    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8310     |
|    time_elapsed     | 0        |
|    total timesteps  | 69       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.915    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 10525    |
|    time_elapsed     | 0        |
|    total timesteps  | 179      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.861    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 10892    |
|    time_elapsed     | 0        |
|    total timesteps  | 292      |
----------------------------------
------------------------

<stable_baselines3.dqn.dqn.DQN at 0x7f009beb5700>