1. Import dependencies

In [4]:
!pip install stable-baselines3[extra]



You should consider upgrading via the 'c:\users\bruno\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [5]:
import os
import gym 
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

2. Load Environment

In [6]:
# Gym registry id of the task used throughout the notebook.
environment_name = "CartPole-v1"
# Plain (non-vectorised) environment used by the random-action rollout.
env = gym.make(environment_name)

In [42]:
# Sanity check: play a few episodes with uniformly random actions and print
# the accumulated reward of each one.
episodes = 5
for episode in range(1, episodes + 1):
    env.reset()  # start a new episode; the initial observation is not used here
    terminated = False
    truncated = False
    score = 0

    # step() reports two separate end-of-episode flags; the episode is over
    # when either of them is set, so both must be checked.
    while not (terminated or truncated):
        env.render()
        action = env.action_space.sample()  # random action: 0 or 1 (Discrete space)
        observation, reward, terminated, truncated, info = env.step(action)
        score += reward
    print('Episode:{} Score: {}'.format(episode, score))
# Release the environment's resources; the training section creates its own
# environment with gym.make before using `env` again.
env.close()

Episode:1 Score: 13.0
Episode:2 Score: 14.0
Episode:3 Score: 24.0
Episode:4 Score: 15.0
Episode:5 Score: 14.0


3. Train RL Model

In [43]:
# Directory handed to the models as `tensorboard_log`.
log_path = os.path.join('Training', 'Logs')

In [44]:
# Display the resolved log directory (the separator depends on the OS).
log_path

'Training\\Logs'

In [66]:
# Wrap a fresh environment in a single-env DummyVecEnv for training.
# The factory builds its own environment instead of closing over the global
# name `env`, which this very statement rebinds to the vectorised wrapper.
env = DummyVecEnv([lambda: gym.make(environment_name)])
# PPO with the MLP policy; training metrics are logged under `log_path`.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
# The environment is intentionally left open here: the following cells train,
# evaluate and roll out on it, and it is closed explicitly later on.

Using cpu device


In [73]:
# Train the PPO agent for 20k environment steps.
model.learn(total_timesteps=20_000)

Logging to Training\Logs\PPO_8
-----------------------------
| time/              |      |
|    fps             | 1891 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1308         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0048570796 |
|    clip_fraction        | 0.0436       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.564       |
|    explained_variance   | 0.879        |
|    learning_rate        | 0.0003       |
|    loss                 | 5.27         |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00842     |
|    value_loss           | 17.6         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x1c8443d3dc0>

4. Save and Reload Model

In [48]:
# Location (without extension) where the trained PPO model is stored.
PPO_Path = os.path.join(
    'Training',
    'Saved Models',
    'PPO_Model_Cartpole',
)

In [75]:
# Persist the trained model at PPO_Path.
model.save(PPO_Path)

In [50]:
# Drop the in-memory model so the next cell has to restore it from disk.
del model

In [69]:
# Restore the saved PPO model and attach it to the current environment.
model = PPO.load(
    PPO_Path,
    env=env,
)

In [76]:
# Evaluate the loaded policy over 10 rendered episodes; the displayed result
# is the (mean, std) pair of the episode rewards.
evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=True,
)

(500.0, 0.0)

In [65]:
# Close the environment after the evaluation run.
# NOTE(review): cells further down keep calling env.reset()/env.step() on
# this same object — confirm that reusing it after close() is intended.
env.close()

In [81]:
# Roll out the trained policy for a few episodes and print each return.
# `env` is the vectorised wrapper here, so step() returns batched values
# (one entry per wrapped environment) in a 4-tuple.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0.0

    while not done:
        env.render()
        # Act greedily so the rollout matches what evaluate_policy measures
        # by default, instead of sampling from the action distribution.
        action, _ = model.predict(obs, deterministic=True)
        obs, rewards, dones, infos = env.step(action)
        # Only one environment is wrapped: unpack its scalar reward and flag
        # so the score prints as a number rather than a one-element array.
        score += float(rewards[0])
        done = bool(dones[0])
    print('Episode:{} Score: {}'.format(episode, score))
# env is not closed here because later cells reset and train on it.

Episode:1 Score: [500.]
Episode:2 Score: [500.]
Episode:3 Score: [500.]
Episode:4 Score: [500.]
Episode:5 Score: [500.]


In [82]:
# Fetch a fresh initial observation to feed to the policy.
obs = env.reset()

In [84]:
# Ask the policy for an action for this observation; the displayed result is
# a pair whose first element is the action array.
model.predict(obs)

(array([0], dtype=int64), None)

In [None]:
# Draw a random action from the environment's action space
# (the attribute is `action_space`; `action_sample` does not exist).
env.action_space.sample()

In [85]:
# Log folder of one specific training run.
# NOTE(review): the run name 'PPO_8' is hard-coded and matches the
# "Logging to ..." line of an earlier training output — it will differ on a
# fresh run; confirm or derive it from the latest run directory.
training_log_path = os.path.join(
    log_path,
    'PPO_8',
)

In [86]:
# Display the run-specific log directory.
training_log_path

'Training\\Logs\\PPO_8'

Abra o cmd no diretório de logs do modelo e digite "tensorboard --logdir=.". O TensorBoard ficará disponível em localhost:6006.

In [87]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [88]:
# Folder in which the evaluation callback stores the best model it has seen.
save_path = os.path.join('Training', 'Saved Models')

In [92]:
# Stop training once the evaluated mean reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=500, verbose=1)
# Evaluate on a dedicated environment: running evaluation episodes on the
# training environment would reset it in the middle of rollout collection.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])
# Every `eval_freq` steps the agent is evaluated; a new best model is saved
# under `save_path` and triggers the stop check above.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [93]:
# Fresh PPO agent that will be trained with the evaluation callback.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device


In [94]:
# Train for up to 20k steps; the callback may end training earlier.
model.learn(
    total_timesteps=20_000,
    callback=eval_callback,
)

Logging to Training\Logs\PPO_10
-----------------------------
| time/              |      |
|    fps             | 1871 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1129        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008787589 |
|    clip_fraction        | 0.112       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00306     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.91        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0177     |
|    value_loss           | 50.6        |
-----------------------------------------
--

<stable_baselines3.ppo.ppo.PPO at 0x1c844536a60>

In [96]:
# Custom network layout: four hidden layers of 128 units each for the policy
# head ('pi') and for the value head ('vf').
# NOTE(review): newer Stable-Baselines3 releases expect a plain dict here
# rather than a one-element list — confirm against the installed version.
hidden_layers = [128] * 4
net_arch = [{'pi': list(hidden_layers), 'vf': list(hidden_layers)}]

In [97]:
# PPO agent that uses the custom network architecture defined above.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

Using cpu device




In [98]:
# Train the custom-architecture agent for up to 20k steps.
# NOTE(review): this reuses the `eval_callback` object already passed to an
# earlier learn() call — confirm that sharing its state is intended.
model.learn(
    total_timesteps=20_000,
    callback=eval_callback,
)

Logging to Training\Logs\PPO_11
-----------------------------
| time/              |      |
|    fps             | 1013 |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 745         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012842581 |
|    clip_fraction        | 0.165       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.0026     |
|    learning_rate        | 0.0003      |
|    loss                 | 3.53        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0191     |
|    value_loss           | 18.6        |
-----------------------------------------
--

<stable_baselines3.ppo.ppo.PPO at 0x1c8445377f0>

In [100]:
from stable_baselines3 import DQN

In [101]:
# Swap the algorithm: a DQN agent on the same environment and log directory.
model = DQN(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device


In [102]:
# Train the DQN agent for up to 20k steps.
# NOTE(review): this reuses the `eval_callback` object already passed to the
# PPO learn() calls — confirm that sharing its state is intended.
model.learn(
    total_timesteps=20_000,
    callback=eval_callback,
)

Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.97     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2768     |
|    time_elapsed     | 0        |
|    total_timesteps  | 64       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.933    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3487     |
|    time_elapsed     | 0        |
|    total_timesteps  | 140      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.903    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 3691     |
|    time_elapsed     | 0        |
|    total_timesteps  | 205      |
----------------------------------
------------------------

<stable_baselines3.dqn.dqn.DQN at 0x1c8449ae190>