In [1]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import pyglet

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Gym id of the environment used throughout the notebook.
environment_name = "CartPole-v0"
# Build the plain (non-vectorized) environment for the random-action demo below.
env = gym.make(
    environment_name,
)

In [15]:
# Sanity check before any training: drive the environment with randomly
# sampled actions and report the total reward collected per episode.
episodes = 1
for episode in range(1, episodes + 1):
    state = env.reset()  # initial observation; the random policy never reads it
    score = 0            # cumulative reward for this episode
    done = False

    while True:
        env.render()
        # No model yet: just draw an action from the action space.
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print("Episode:{} Score:{}".format(episode, score))
# The environment is intentionally left open here; it is closed in the next cell.

Episode:1 Score:17.0


In [15]:
# Shut the environment down now that the random rollout has finished.
env.close()

In [24]:
# Inspect the action space. Author's note: action 0 pushes the cart left,
# action 1 pushes it right.
env.action_space

Discrete(2)

In [22]:
# Inspect the observation space. Author's note on the four components, in order:
# cart position, cart velocity, pole angle, pole angular velocity.
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [3]:
# Relative directory handed to PPO as `tensorboard_log`, i.e. where the
# training logs are written.
log_path = os.path.join(
    "CartPole Training",
    "Logs",
)

In [4]:
# Gym id of the environment to train on.
environment_name = "CartPole-v0"

# Wrap the environment in a dummy vectorized environment. A vectorized
# environment lets several sub-environments be stepped together; actions,
# observations and rewards are exchanged as vectors with one entry per
# sub-environment (here there is exactly one).
# The factory builds the environment itself instead of closing over `env`,
# because `env` is the very name this statement rebinds to the wrapper; the
# old form only worked because DummyVecEnv calls the factory immediately.
env = DummyVecEnv([lambda: gym.make(environment_name)])

# Initialize a PPO agent with an MLP policy; training metrics are logged
# under log_path so they can be inspected with TensorBoard.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [6]:
# Train the agent for 20,000 environment timesteps.
model.learn(
    total_timesteps=20_000,
)

Logging to CartPole Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 176  |
|    iterations      | 1    |
|    time_elapsed    | 11   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 191         |
|    iterations           | 2           |
|    time_elapsed         | 21          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011265738 |
|    clip_fraction        | 0.119       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.685      |
|    explained_variance   | 0.00263     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.78        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0176     |
|    value_loss           | 52          |
------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x1c6f802ded0>

In [7]:
# Relative location (without extension) under which the trained model is saved.
PPO_Path = os.path.join(
    'CartPole Training',
    'Saved Models',
    'PPO_Model_CartPole',
)

In [10]:
# Persist the trained model at PPO_Path.
# PPO_Path is relative, so it resolves against the current working directory
# (the author observed this to be the folder containing the notebook).
model.save(
    PPO_Path,
)



In [5]:
# Drop the in-memory model to simulate a deployment scenario in which only
# the saved file is available.
del model

In [8]:
# Restore the agent from disk and attach it to the current environment.
model = PPO.load(
    PPO_Path,
    env=env,
)

Evaluation:

In [14]:
# Evaluate the loaded model over 10 episodes while rendering.
# The displayed tuple is (mean episode reward, standard deviation of the reward).
evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=True,
)



(200.0, 0.0)

Testing

In [9]:
# Roll out the trained agent for a few episodes and print each episode's score.
# `env` is the vectorized wrapper here, so reward/done arrive as one-element
# arrays, which is why the score prints as e.g. [200.].
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    score = 0
    done = False

    while True:
        env.render()
        # The model picks the action; the second value returned by predict()
        # is not needed here and is discarded.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print("Episode:{} Score:{}".format(episode, score))

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [None]:
# Recap of the three calls that make up one step of the rollout loop.
# Each result is bound to a name so the next call consumes the value produced
# here rather than a stale `obs`/`action` left over from an earlier cell.

# reset() returns the initial observation(s).
obs = env.reset()

# predict() returns a pair (action, state); only the action is useful in this
# case, so the second element is discarded.
action, _ = model.predict(obs)

# step() applies the action and returns (observations, rewards, dones, infos);
# it is left as the final expression so the notebook displays the result.
env.step(action)

In [10]:
# Close the vectorized environment once the trained-agent rollouts are done.
env.close()

In [20]:
# Sub-directory of log_path holding the logs of the first PPO run ("PPO_1").
training_log_path = os.path.join(
    log_path,
    'PPO_1',
)

In [24]:
# Display the resolved log directory to confirm it before launching TensorBoard.
training_log_path

'CartPole Training\\Logs\\PPO_1'

In [28]:
!tensorboard --logdir={training_log_path}
#an ! is a "magic command"
#allows using command line commands in a notebook

TensorFlow installation not found - running with reduced feature set.
usage: tensorboard [-h] [--helpfull] [--logdir PATH] [--logdir_spec PATH_SPEC]
                   [--host ADDR] [--bind_all] [--port PORT]
                   [--reuse_port BOOL] [--load_fast {false,auto,true}]
                   [--extra_data_server_flags EXTRA_DATA_SERVER_FLAGS]
                   [--grpc_creds_type {local,ssl,ssl_dev}]
                   [--grpc_data_provider PORT] [--purge_orphaned_data BOOL]
                   [--db URI] [--db_import] [--inspect] [--version_tb]
                   [--tag TAG] [--event_file PATH] [--path_prefix PATH]
                   [--window_title TEXT] [--max_reload_threads COUNT]
                   [--reload_interval SECONDS] [--reload_task TYPE]
                   [--reload_multifile BOOL]
                   [--reload_multifile_inactive_secs SECONDS]
                   [--generic_data TYPE]
                   [--samples_per_plugin SAMPLES_PER_PLUGIN]
                   [--de

In [None]:
#above didn't work
%tensorboard
#clicking 'Launch TensorBoard Session' and manually navigating directories worked