In [1]:
import os
import gym
from IPython import display
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.common.vec_env import DummyVecEnv
from utils.ppo import PPO
from utils.models import Policy, CNNPolicy

In [2]:
# Configure CUDA through environment variables: order devices by PCI bus id
# and make only device "0" visible to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

In [3]:
# Root directory for logs: the kernel's current working directory.
# A later configuration cell rebinds LOGS to the per-environment/per-run subdirectory.
LOGS = os.getcwd()

In [4]:
def makedirs(path):
    """Create the directory ``path``, including any missing parent directories.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory between check and create.
    os.makedirs(path, exist_ok=True)

In [6]:
# --- Experiment configuration -------------------------------------------
env_name = 'CarRacing-v0'      # Gym environment id passed to gym.make
run_id = 1                     # selects the 'run<id>' output directory
n_steps = 250                  # forwarded to PPO as n_steps
total_timesteps = 10000000     # forwarded to model.learn
cnn_policy = True              # True -> CNNPolicy, False -> Policy
n_cpu = 4                      # number of environment copies in the SubprocVecEnv

# --- Output locations ----------------------------------------------------
# Build the run directory from the working directory rather than from the
# previous value of LOGS: joining onto LOGS itself made this cell
# non-idempotent (every re-run nested another '<env_name>/run<id>' level).
LOGS = os.path.join(os.getcwd(), env_name, 'run{}'.format(run_id))
makedirs(LOGS)
tb_log = os.path.join(LOGS, 'tb')            # tensorboard_log directory
makedirs(tb_log)
model_dir = os.path.join(LOGS, 'models')     # save_file outputs written during learn()
makedirs(model_dir)
# Path handed to model.save() / load(); not created as a directory here.
final_model_dir = os.path.join(LOGS, 'model')

In [None]:
# Vectorized training environment: n_cpu independent copies of env_name.
env = SubprocVecEnv([lambda: gym.make(env_name) for _ in range(n_cpu)])

try:
    # Pick the policy class once so the PPO construction is not duplicated.
    if cnn_policy:
        print('Using CNN policy network')
        policy_cls = CNNPolicy
    else:
        print('Using MLP policy network')
        policy_cls = Policy

    model = PPO(policy_cls, env, n_steps=n_steps, tensorboard_log=tb_log, verbose=1, full_tensorboard_log=True)
    model.learn(total_timesteps, env, save_file=os.path.join(model_dir, 'model'))
    model.save(final_model_dir)
    del model  # drop the in-memory model so the next cell has to reload it from disk
finally:
    # Always shut the vectorized environment down, even if training fails or
    # is interrupted; otherwise its worker processes are left running.
    env.close()

Using CNN policy network
INFO:tensorflow:Summary name model/c1/w:0 is illegal; using model/c1/w_0 instead.
INFO:tensorflow:Summary name model/c1/b:0 is illegal; using model/c1/b_0 instead.
INFO:tensorflow:Summary name model/c2/w:0 is illegal; using model/c2/w_0 instead.
INFO:tensorflow:Summary name model/c2/b:0 is illegal; using model/c2/b_0 instead.
INFO:tensorflow:Summary name model/c3/w:0 is illegal; using model/c3/w_0 instead.
INFO:tensorflow:Summary name model/c3/b:0 is illegal; using model/c3/b_0 instead.
INFO:tensorflow:Summary name model/fc1/w:0 is illegal; using model/fc1/w_0 instead.
INFO:tensorflow:Summary name model/fc1/b:0 is illegal; using model/fc1/b_0 instead.
INFO:tensorflow:Summary name model/vf/w:0 is illegal; using model/vf/w_0 instead.
INFO:tensorflow:Summary name model/vf/b:0 is illegal; using model/vf/b_0 instead.
INFO:tensorflow:Summary name model/pi/w:0 is illegal; using model/pi/w_0 instead.
INFO:tensorflow:Summary name model/pi/b:0 is illegal; using model/pi/

--------------------------------------
| approxkl           | 0.007066193   |
| clipfrac           | 0.09600001    |
| explained_variance | 0.102         |
| fps                | 93            |
| nupdates           | 12            |
| policy_entropy     | 4.2492347     |
| policy_loss        | -0.0030083596 |
| serial_timesteps   | 3000          |
| time_elapsed       | 121           |
| total_timesteps    | 12096         |
| value_loss         | 0.6276586     |
--------------------------------------
--------------------------------------
| approxkl           | 0.005414115   |
| clipfrac           | 0.07375       |
| explained_variance | 0.0725        |
| fps                | 94            |
| nupdates           | 13            |
| policy_entropy     | 4.247184      |
| policy_loss        | -0.0027425003 |
| serial_timesteps   | 3250          |
| time_elapsed       | 132           |
| total_timesteps    | 13104         |
| value_loss         | 0.64136213    |
-------------------------

In [None]:
from stable_baselines.common.policies import FeedForwardPolicy

In [None]:
# Reload the model saved by the training cell. It was trained and saved with
# the project's PPO class; PPO2 is never imported in this notebook, so the
# original PPO2.load(...) raised NameError on a fresh kernel.
# NOTE(review): assumes utils.ppo.PPO exposes a `load` matching its `save` - confirm.
model = PPO.load(final_model_dir)

# Enjoy trained agent
env = gym.make(env_name)
try:
    obs = env.reset()
    # Runs until the cell is interrupted (Kernel -> Interrupt).
    while True:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            # Start a new episode instead of stepping a finished one.
            obs = env.reset()
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop the demo.
    pass
finally:
    # Release the environment and its render window.
    env.close()