In [4]:
import torch as t
import os 
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [7]:
# Create the CartPole-v0 environment used by the exploration cells.
env =gym.make('CartPole-v0')

In [3]:
# Run a few episodes with a purely random policy to see the environment in action.
episodes = 5  # number of episodes to play
for episode in range(1, episodes + 1):
    # reset() starts a new episode and returns the initial observation.
    state = env.reset()
    score = 0
    done = False

    while not done:
        env.render()
        # The action space is Discrete(2), so the sampled action is 0 or 1.
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} score{}'.format(episode, score))
env.close()

: 

: 

In [7]:
## Understanding the environment

In [8]:
# The action space has two discrete actions: 0 and 1.
env.action_space

Discrete(2)

In [9]:
# Draw a random action:
#   0 -> push the cart to the left
#   1 -> push the cart to the right
env.action_space.sample()

0

In [10]:
# The observation space is Box(4), i.e. four observations:
#   cart position, cart velocity, pole angle, pole angular velocity
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [11]:
# Draw a random observation from the observation space.
env.observation_space.sample()

array([-1.6168649e+00,  2.1736372e+38, -1.7665635e-01,  2.1950139e+38],
      dtype=float32)

In [12]:
# Take a single step with action 1.
# NOTE: as the recorded output shows, this raises an AssertionError when
# env.reset() has not been called beforehand.
env.step(1)

AssertionError: Cannot call env.step() before calling reset()

In [13]:
##### Training

In [14]:
# Directory that receives the TensorBoard logs written during training.
log_path = os.path.join('nicho_reinforecement', 'log')

In [7]:
# Display the log directory path.
log_path

'nicho_reinforecement/log'

In [15]:
# NOTE(review): tensorboard is not referenced directly in this cell; presumably
# imported to check that it is installed for `tensorboard_log` — confirm.
import tensorboard

# Build a fresh CartPole-v0 environment wrapped in a single-environment
# DummyVecEnv, then create a PPO agent with an MLP policy logging to `log_path`.
env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)


Using cpu device


In [16]:
# Train the PPO model for 20,000 timesteps.
model.learn(total_timesteps=20000)

Logging to nicho_reinforecement/log/PPO_2
-----------------------------
| time/              |      |
|    fps             | 622  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 1039       |
|    iterations           | 2          |
|    time_elapsed         | 3          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00876465 |
|    clip_fraction        | 0.0729     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.687     |
|    explained_variance   | 0.00169    |
|    learning_rate        | 0.0003     |
|    loss                 | 5.51       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0102    |
|    value_loss           | 52.3       |
----------------------------------------
----------

<stable_baselines3.ppo.ppo.PPO at 0x15ccb2c70>

In [17]:
#####################################save and reload model###############################################

In [18]:
# Where the trained model is stored. The `save_model` directory must sit in the
# same directory as the project.
PPO_Path = os.path.join('save_model', 'PPO_model_cartpole')

In [19]:
# Save the trained model to PPO_Path.
model.save(PPO_Path)

In [20]:
# Remove the in-memory model before reloading it from disk.
del model

In [21]:
# Display the path the model was saved to.
PPO_Path

'save_model/PPO_model_cartpole'

In [22]:
# Reload the saved model from PPO_Path and attach the current environment to it.
model = PPO.load(PPO_Path,env=env)

In [23]:
# Continue training the reloaded model for another 10,000 timesteps.
model.learn(total_timesteps=10000)

Logging to nicho_reinforecement/log/PPO_3
-----------------------------
| time/              |      |
|    fps             | 7528 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 4778         |
|    iterations           | 2            |
|    time_elapsed         | 0            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0064823716 |
|    clip_fraction        | 0.0271       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.564       |
|    explained_variance   | 0.707        |
|    learning_rate        | 0.0003       |
|    loss                 | 7.28         |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00617     |
|    value_loss           | 27.1         |
-----------------

<stable_baselines3.ppo.ppo.PPO at 0x115b60c70>

In [24]:
#############################################Evaluation######################################

In [25]:
# Evaluate the model over 10 episodes; with render=False no visualisation is shown.
evaluate_policy(model,env,n_eval_episodes=10,render=False)



(200.0, 0.0)

In [26]:
# Close the environment after evaluation.
env.close()

In [None]:
############################### Test Model ###################################

In [27]:
# Watch the trained model play a few rendered episodes.
episodes = 5  # number of episodes to play
for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    # Fix: this accumulator was misspelled `socre`, so `score` was never reset
    # here and `score += reward` depended on a stale global from another cell
    # (or raised NameError on a fresh kernel).
    score = 0

    while not done:
        env.render()
        # Ask the model for an action. The second return value is unused; it is
        # named `_states` so IPython's `_` (last output) is not overwritten.
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
# Close the environment after rendering, as the random-policy loop does.
env.close()

: 

: 

In [1]:
############################ Adding a callback to the training stage #########################
# Log directory passed as `tensorboard_log` to the models created from here on.
training_log_path = os.path.join('nicho_reinforecement', 'log', 'OPP_2')

In [2]:
from stable_baselines3.common.callbacks import EvalCallback,StopTrainingOnRewardThreshold

In [5]:
# Directory where the evaluation callback stores the best model.
# (os.path.join with a single component just returns that component.)
save_path = 'save_model'

In [8]:
# Callback that stops training once the reward threshold of 200 is reached.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Evaluation callback on `env` with eval_freq=10000. Whenever a new best model
# is found it is saved under `save_path` and `stop_callback` is invoked.
eval_callback = EvalCallback(
    env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)

In [16]:
# Create a fresh PPO model whose TensorBoard logs go under training_log_path.
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=training_log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [17]:
# Train for up to 20,000 timesteps with eval_callback attached, so training can
# stop early once the reward threshold is reached.
model.learn(total_timesteps=20000,callback=eval_callback)



Logging to nicho_reinforecement/log/OPP_2/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.7     |
|    ep_rew_mean     | 23.7     |
| time/              |          |
|    fps             | 600      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 28.7        |
|    ep_rew_mean          | 28.7        |
| time/                   |             |
|    fps                  | 1019        |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008605739 |
|    clip_fraction        | 0.1         |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance 



<stable_baselines3.ppo.ppo.PPO at 0x153149e50>

In [18]:
#####################Changing Policies######################

In [19]:
# net_arch is a custom-designed network architecture: four layers of 128 units
# for each of the `pi` and `vf` entries.
net_arch = [{'pi': [128] * 4, 'vf': [128] * 4}]

# PPO uses the multilayer-perceptron policy (MlpPolicy); the custom net_arch is
# passed in through policy_kwargs.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=training_log_path,
    policy_kwargs={'net_arch': net_arch},
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [20]:
# Train the custom-architecture model for up to 200,000 timesteps, reusing the
# same eval_callback.
model.learn(total_timesteps=200000,callback=eval_callback)

Logging to nicho_reinforecement/log/OPP_2/PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.8     |
|    ep_rew_mean     | 21.8     |
| time/              |          |
|    fps             | 5957     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 28.6         |
|    ep_rew_mean          | 28.6         |
| time/                   |              |
|    fps                  | 3158         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0138332285 |
|    clip_fraction        | 0.18         |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.682       |
|    expla



-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 90.4        |
|    ep_rew_mean          | 90.4        |
| time/                   |             |
|    fps                  | 2409        |
|    iterations           | 6           |
|    time_elapsed         | 5           |
|    total_timesteps      | 12288       |
| train/                  |             |
|    approx_kl            | 0.015515177 |
|    clip_fraction        | 0.0905      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.549      |
|    explained_variance   | 0.798       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.71        |
|    n_updates            | 50          |
|    policy_gradient_loss | -0.00394    |
|    value_loss           | 14.9        |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 108 

<stable_baselines3.ppo.ppo.PPO at 0x29e7a30d0>

In [21]:
#########################################Using an Alternate Algorithm########################

In [23]:
from stable_baselines3 import DQN

In [24]:
# Use DQN in place of PPO on the same environment, logging to the same
# directory, and train it for 20,000 timesteps.
model = DQN(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=training_log_path,
)
model.learn(total_timesteps=20000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to nicho_reinforecement/log/OPP_2/DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 16.5     |
|    ep_rew_mean      | 16.5     |
|    exploration_rate | 0.969    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8962     |
|    time_elapsed     | 0        |
|    total_timesteps  | 66       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 19.6     |
|    ep_rew_mean      | 19.6     |
|    exploration_rate | 0.925    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 12823    |
|    time_elapsed     | 0        |
|    total_timesteps  | 157      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean  

<stable_baselines3.dqn.dqn.DQN at 0x2a00e64c0>