# Reinforcement Learning: CartPole

Importing Libraries.

In [1]:
import numpy as np
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

Define the name of the Gym environment to train on (CartPole).

In [2]:
# Gym environment ID passed to gym.make() when the environment is created.
ENV_NAME = "CartPole-v0"

Get the environment and extract the number of actions.

In [3]:
# One seed value shared by NumPy's global RNG and the environment, kept in a
# single named constant so the two calls below cannot drift apart.
SEED = 123

# Create the environment named by ENV_NAME, then seed both sources of randomness.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Sizes read from the environment's spaces: the number of actions and the
# first dimension of the observation shape.
nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

Option 1: Simple model

In [4]:
# Minimal policy network: flatten the (1, observation) input, apply a single
# dense layer with one unit per action, and normalise with a softmax.
# NOTE(review): the leading 1 in input_shape presumably corresponds to the
# window_length=1 used by the agent's memory — confirm.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(nb_actions),
    Activation('softmax'),
])

Model Summary

In [5]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 2)                 10        
_________________________________________________________________
activation (Activation)      (None, 2)                 0         
Total params: 10
Trainable params: 10
Non-trainable params: 0
_________________________________________________________________
None


Finally, we configure and compile our agent. The cross-entropy method (CEM) is gradient-free, so
`cem.compile()` is called here without an optimizer or metrics.

In [6]:
# Memory handed to the agent; window_length=1 matches the leading 1 in the
# model's input shape.
memory = EpisodeParameterMemory(limit=1000, window_length=1)

# Cross-entropy-method agent built around the policy network and memory above.
# NOTE(review): batch_size / train_interval / elite_frac presumably control how
# many sampled parameter sets are evaluated per update and what fraction of the
# best ones is kept — confirm against the keras-rl CEMAgent documentation.
cem = CEMAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    batch_size=50,
    nb_steps_warmup=2000,
    train_interval=50,
    elite_frac=0.05,
)
cem.compile()


Fitting the model.

In [7]:
# Train the agent on the environment for 100000 steps with visualisation
# switched off; verbose=2 produces one log line per episode.
cem.fit(
    env,
    nb_steps=100000,
    visualize=False,
    verbose=2,
)

Training for 100000 steps ...
    24/100000: episode: 1, duration: 0.235s, episode steps: 24, steps per second: 102, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.583 [0.000, 1.000], mean observation: -0.102 [-1.878, 0.796], mean_best_reward: --
    36/100000: episode: 2, duration: 0.073s, episode steps: 12, steps per second: 164, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.152 [-1.135, 2.120], mean_best_reward: --
    75/100000: episode: 3, duration: 0.248s, episode steps: 39, steps per second: 157, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.436 [0.000, 1.000], mean observation: -0.037 [-1.476, 1.596], mean_best_reward: --
   107/100000: episode: 4, duration: 0.166s, episode steps: 32, steps per second: 192, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.102 [-0.608, 1.249], mean_best_reward:

   722/100000: episode: 36, duration: 0.111s, episode steps: 20, steps per second: 180, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.650 [0.000, 1.000], mean observation: -0.088 [-2.088, 1.162], mean_best_reward: --
   759/100000: episode: 37, duration: 0.196s, episode steps: 37, steps per second: 188, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.405 [0.000, 1.000], mean observation: 0.033 [-1.544, 2.338], mean_best_reward: --
   771/100000: episode: 38, duration: 0.073s, episode steps: 12, steps per second: 164, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.917 [0.000, 1.000], mean observation: -0.121 [-3.049, 1.974], mean_best_reward: --
   785/100000: episode: 39, duration: 0.087s, episode steps: 14, steps per second: 161, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.643 [0.000, 1.000], mean observation: -0.102 [-1.734, 0.983], mean_best_reward: --
   835/100000: episod

  1552/100000: episode: 71, duration: 0.066s, episode steps: 13, steps per second: 198, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.769 [0.000, 1.000], mean observation: -0.098 [-2.379, 1.517], mean_best_reward: --
  1564/100000: episode: 72, duration: 0.068s, episode steps: 12, steps per second: 177, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.123 [-0.957, 1.779], mean_best_reward: --
  1576/100000: episode: 73, duration: 0.070s, episode steps: 12, steps per second: 173, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.167 [0.000, 1.000], mean observation: 0.101 [-1.612, 2.570], mean_best_reward: --
  1597/100000: episode: 74, duration: 0.133s, episode steps: 21, steps per second: 158, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.619 [0.000, 1.000], mean observation: -0.030 [-1.829, 1.209], mean_best_reward: --
  1636/100000: episode

  2298/100000: episode: 105, duration: 0.278s, episode steps: 59, steps per second: 212, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.091 [-0.682, 1.049], mean_best_reward: --
  2335/100000: episode: 106, duration: 0.167s, episode steps: 37, steps per second: 222, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.118 [-0.899, 0.549], mean_best_reward: --
  2374/100000: episode: 107, duration: 0.181s, episode steps: 39, steps per second: 216, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: 0.006 [-1.354, 0.986], mean_best_reward: --
  2394/100000: episode: 108, duration: 0.092s, episode steps: 20, steps per second: 217, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.072 [-0.929, 0.632], mean_best_reward: --
  2406/100000: epi

  3533/100000: episode: 139, duration: 0.327s, episode steps: 66, steps per second: 202, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.119 [-1.145, 0.627], mean_best_reward: --
  3556/100000: episode: 140, duration: 0.104s, episode steps: 23, steps per second: 221, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: 0.050 [-0.776, 1.146], mean_best_reward: --
  3575/100000: episode: 141, duration: 0.091s, episode steps: 19, steps per second: 208, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.086 [-1.298, 0.765], mean_best_reward: --
  3595/100000: episode: 142, duration: 0.094s, episode steps: 20, steps per second: 212, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.091 [-1.471, 0.780], mean_best_reward: --
  3617/100000: ep

  4732/100000: episode: 173, duration: 0.275s, episode steps: 55, steps per second: 200, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.436 [0.000, 1.000], mean observation: -0.206 [-1.258, 0.661], mean_best_reward: --
  4768/100000: episode: 174, duration: 0.175s, episode steps: 36, steps per second: 206, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.115 [-0.446, 1.442], mean_best_reward: --
  4789/100000: episode: 175, duration: 0.110s, episode steps: 21, steps per second: 191, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: 0.101 [-0.578, 1.304], mean_best_reward: --
  4852/100000: episode: 176, duration: 0.310s, episode steps: 63, steps per second: 204, episode reward: 63.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.460 [0.000, 1.000], mean observation: -0.190 [-1.035, 0.455], mean_best_reward: --
  4932/100000: epi

  6185/100000: episode: 208, duration: 0.111s, episode steps: 17, steps per second: 153, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: 0.075 [-0.981, 1.458], mean_best_reward: --
  6246/100000: episode: 209, duration: 0.354s, episode steps: 61, steps per second: 172, episode reward: 61.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: 0.083 [-0.636, 0.977], mean_best_reward: --
  6279/100000: episode: 210, duration: 0.158s, episode steps: 33, steps per second: 209, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: -0.086 [-1.222, 0.732], mean_best_reward: --
  6322/100000: episode: 211, duration: 0.276s, episode steps: 43, steps per second: 156, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.083 [-0.621, 0.862], mean_best_reward: --
  6363/100000: epis

  7281/100000: episode: 242, duration: 0.200s, episode steps: 32, steps per second: 160, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.035 [-0.826, 1.113], mean_best_reward: --
  7307/100000: episode: 243, duration: 0.180s, episode steps: 26, steps per second: 145, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.577 [0.000, 1.000], mean observation: -0.032 [-1.751, 1.029], mean_best_reward: --
  7344/100000: episode: 244, duration: 0.336s, episode steps: 37, steps per second: 110, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: 0.002 [-1.166, 1.226], mean_best_reward: --
  7376/100000: episode: 245, duration: 0.202s, episode steps: 32, steps per second: 159, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.406 [0.000, 1.000], mean observation: 0.064 [-1.244, 2.287], mean_best_reward: --
  7414/100000: epis

  8566/100000: episode: 277, duration: 0.106s, episode steps: 22, steps per second: 209, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.108 [-0.459, 1.295], mean_best_reward: --
  8643/100000: episode: 278, duration: 0.461s, episode steps: 77, steps per second: 167, episode reward: 77.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.168 [-0.736, 1.167], mean_best_reward: --
  8676/100000: episode: 279, duration: 0.154s, episode steps: 33, steps per second: 215, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.141 [-0.597, 1.152], mean_best_reward: --
  8692/100000: episode: 280, duration: 0.087s, episode steps: 16, steps per second: 185, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.312 [0.000, 1.000], mean observation: 0.103 [-1.168, 2.087], mean_best_reward: --
  8711/100000: episo

  9855/100000: episode: 312, duration: 0.244s, episode steps: 43, steps per second: 176, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.465 [0.000, 1.000], mean observation: -0.138 [-1.120, 0.634], mean_best_reward: --
  9869/100000: episode: 313, duration: 0.067s, episode steps: 14, steps per second: 209, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.643 [0.000, 1.000], mean observation: -0.094 [-1.674, 0.981], mean_best_reward: --
  9945/100000: episode: 314, duration: 0.398s, episode steps: 76, steps per second: 191, episode reward: 76.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: -0.133 [-1.254, 1.118], mean_best_reward: --
  9985/100000: episode: 315, duration: 0.202s, episode steps: 40, steps per second: 198, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: 0.052 [-0.748, 1.098], mean_best_reward: --
 10006/100000: ep

 11006/100000: episode: 346, duration: 0.270s, episode steps: 39, steps per second: 144, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: -0.072 [-1.598, 0.656], mean_best_reward: --
 11076/100000: episode: 347, duration: 0.385s, episode steps: 70, steps per second: 182, episode reward: 70.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.124 [-0.738, 1.552], mean_best_reward: --
 11119/100000: episode: 348, duration: 0.265s, episode steps: 43, steps per second: 163, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.465 [0.000, 1.000], mean observation: -0.139 [-0.938, 0.633], mean_best_reward: --
 11156/100000: episode: 349, duration: 0.216s, episode steps: 37, steps per second: 172, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: 0.088 [-0.550, 1.077], mean_best_reward: --
 11181/100000: epi

 12285/100000: episode: 381, duration: 0.351s, episode steps: 70, steps per second: 199, episode reward: 70.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.025 [-1.072, 0.725], mean_best_reward: --
 12351/100000: episode: 382, duration: 0.306s, episode steps: 66, steps per second: 216, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.088 [-0.970, 1.281], mean_best_reward: --
 12375/100000: episode: 383, duration: 0.125s, episode steps: 24, steps per second: 192, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: 0.035 [-0.978, 1.308], mean_best_reward: --
 12433/100000: episode: 384, duration: 0.415s, episode steps: 58, steps per second: 140, episode reward: 58.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.534 [0.000, 1.000], mean observation: 0.133 [-0.568, 1.213], mean_best_reward: --
 12520/100000: epi

 13556/100000: episode: 416, duration: 0.259s, episode steps: 57, steps per second: 220, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: 0.036 [-0.652, 1.156], mean_best_reward: --
 13577/100000: episode: 417, duration: 0.097s, episode steps: 21, steps per second: 217, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: 0.126 [-0.766, 1.279], mean_best_reward: --
 13595/100000: episode: 418, duration: 0.084s, episode steps: 18, steps per second: 214, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.114 [-0.390, 1.085], mean_best_reward: --
 13613/100000: episode: 419, duration: 0.085s, episode steps: 18, steps per second: 211, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.120 [-0.541, 1.064], mean_best_reward: --
 13660/100000: episo

 14778/100000: episode: 451, duration: 0.167s, episode steps: 33, steps per second: 197, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.064 [-1.150, 0.621], mean_best_reward: 88.500000
 14804/100000: episode: 452, duration: 0.126s, episode steps: 26, steps per second: 206, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.067 [-0.623, 1.176], mean_best_reward: --
 14824/100000: episode: 453, duration: 0.103s, episode steps: 20, steps per second: 195, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.081 [-1.203, 0.631], mean_best_reward: --
 14877/100000: episode: 454, duration: 0.251s, episode steps: 53, steps per second: 211, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.011 [-1.279, 1.104], mean_best_reward: --
 14908/1000

 15963/100000: episode: 485, duration: 0.295s, episode steps: 48, steps per second: 162, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.027 [-1.160, 0.953], mean_best_reward: --
 16019/100000: episode: 486, duration: 0.285s, episode steps: 56, steps per second: 197, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.053 [-0.603, 1.023], mean_best_reward: --
 16052/100000: episode: 487, duration: 0.179s, episode steps: 33, steps per second: 185, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.087 [-0.585, 1.296], mean_best_reward: --
 16090/100000: episode: 488, duration: 0.181s, episode steps: 38, steps per second: 210, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.553 [0.000, 1.000], mean observation: 0.006 [-1.511, 0.993], mean_best_reward: --
 16131/100000: episo

 17205/100000: episode: 519, duration: 0.210s, episode steps: 37, steps per second: 176, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: -0.091 [-1.455, 0.569], mean_best_reward: --
 17242/100000: episode: 520, duration: 0.214s, episode steps: 37, steps per second: 173, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.541 [0.000, 1.000], mean observation: -0.131 [-1.813, 0.579], mean_best_reward: --
 17264/100000: episode: 521, duration: 0.119s, episode steps: 22, steps per second: 185, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.094 [-1.102, 0.591], mean_best_reward: --
 17290/100000: episode: 522, duration: 0.152s, episode steps: 26, steps per second: 171, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.423 [0.000, 1.000], mean observation: 0.020 [-1.178, 1.787], mean_best_reward: --
 17324/100000: ep

 18569/100000: episode: 554, duration: 0.186s, episode steps: 28, steps per second: 151, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.116 [-0.547, 0.944], mean_best_reward: --
 18597/100000: episode: 555, duration: 0.190s, episode steps: 28, steps per second: 148, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: 0.041 [-0.774, 1.212], mean_best_reward: --
 18630/100000: episode: 556, duration: 0.197s, episode steps: 33, steps per second: 167, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: -0.161 [-1.193, 0.739], mean_best_reward: --
 18686/100000: episode: 557, duration: 0.362s, episode steps: 56, steps per second: 155, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: 0.241 [-0.941, 1.508], mean_best_reward: --
 18705/100000: epis

 19710/100000: episode: 588, duration: 0.386s, episode steps: 51, steps per second: 132, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.090 [-0.519, 1.057], mean_best_reward: --
 19748/100000: episode: 589, duration: 0.259s, episode steps: 38, steps per second: 147, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: -0.057 [-1.420, 0.743], mean_best_reward: --
 19767/100000: episode: 590, duration: 0.140s, episode steps: 19, steps per second: 135, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.579 [0.000, 1.000], mean observation: -0.085 [-1.318, 0.642], mean_best_reward: --
 19891/100000: episode: 591, duration: 0.714s, episode steps: 124, steps per second: 174, episode reward: 124.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: -0.003 [-0.990, 1.086], mean_best_reward: --
 19915/100000: 

 21249/100000: episode: 622, duration: 0.217s, episode steps: 41, steps per second: 189, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: 0.149 [-0.727, 1.547], mean_best_reward: --
 21265/100000: episode: 623, duration: 0.102s, episode steps: 16, steps per second: 156, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.095 [-1.069, 0.616], mean_best_reward: --
 21282/100000: episode: 624, duration: 0.147s, episode steps: 17, steps per second: 116, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: 0.087 [-0.764, 1.288], mean_best_reward: --
 21344/100000: episode: 625, duration: 0.382s, episode steps: 62, steps per second: 162, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.085 [-1.204, 0.457], mean_best_reward: --
 21361/100000: epi

 22446/100000: episode: 657, duration: 0.104s, episode steps: 22, steps per second: 212, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.078 [-0.431, 1.051], mean_best_reward: --
 22466/100000: episode: 658, duration: 0.094s, episode steps: 20, steps per second: 213, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: -0.038 [-1.614, 1.195], mean_best_reward: --
 22492/100000: episode: 659, duration: 0.120s, episode steps: 26, steps per second: 216, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.081 [-1.302, 0.600], mean_best_reward: --
 22529/100000: episode: 660, duration: 0.170s, episode steps: 37, steps per second: 217, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.077 [-0.827, 0.384], mean_best_reward: --
 22543/100000: ep

 23817/100000: episode: 691, duration: 0.232s, episode steps: 47, steps per second: 203, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.532 [0.000, 1.000], mean observation: 0.066 [-0.783, 1.055], mean_best_reward: --
 23869/100000: episode: 692, duration: 0.284s, episode steps: 52, steps per second: 183, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.060 [-0.473, 1.210], mean_best_reward: --
 23949/100000: episode: 693, duration: 0.396s, episode steps: 80, steps per second: 202, episode reward: 80.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.041 [-1.478, 0.774], mean_best_reward: --
 23961/100000: episode: 694, duration: 0.057s, episode steps: 12, steps per second: 210, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.103 [-1.423, 2.246], mean_best_reward: --
 23990/100000: epis

 25083/100000: episode: 726, duration: 0.128s, episode steps: 27, steps per second: 211, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: 0.092 [-0.758, 1.675], mean_best_reward: --
 25109/100000: episode: 727, duration: 0.127s, episode steps: 26, steps per second: 206, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.088 [-0.965, 0.396], mean_best_reward: --
 25155/100000: episode: 728, duration: 0.248s, episode steps: 46, steps per second: 185, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.112 [-0.654, 1.172], mean_best_reward: --
 25189/100000: episode: 729, duration: 0.191s, episode steps: 34, steps per second: 178, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.015 [-1.391, 1.000], mean_best_reward: --
 25217/100000: epi

 26370/100000: episode: 760, duration: 0.219s, episode steps: 38, steps per second: 174, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.110 [-1.006, 0.563], mean_best_reward: --
 26415/100000: episode: 761, duration: 0.209s, episode steps: 45, steps per second: 216, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.422 [0.000, 1.000], mean observation: 0.032 [-1.749, 2.489], mean_best_reward: --
 26435/100000: episode: 762, duration: 0.106s, episode steps: 20, steps per second: 189, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: -0.083 [-1.387, 0.772], mean_best_reward: --
 26475/100000: episode: 763, duration: 0.257s, episode steps: 40, steps per second: 155, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: -0.027 [-1.200, 1.529], mean_best_reward: --
 26490/100000: ep

 27680/100000: episode: 794, duration: 0.301s, episode steps: 66, steps per second: 219, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.530 [0.000, 1.000], mean observation: 0.168 [-0.805, 1.115], mean_best_reward: --
 27708/100000: episode: 795, duration: 0.143s, episode steps: 28, steps per second: 195, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.092 [-0.584, 1.168], mean_best_reward: --
 27783/100000: episode: 796, duration: 0.415s, episode steps: 75, steps per second: 181, episode reward: 75.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: -0.142 [-1.420, 1.165], mean_best_reward: --
 27800/100000: episode: 797, duration: 0.084s, episode steps: 17, steps per second: 202, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: 0.089 [-0.807, 1.218], mean_best_reward: --
 27829/100000: epis

 29190/100000: episode: 828, duration: 0.284s, episode steps: 52, steps per second: 183, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.104 [-1.127, 0.526], mean_best_reward: --
 29223/100000: episode: 829, duration: 0.169s, episode steps: 33, steps per second: 195, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: -0.093 [-0.927, 0.428], mean_best_reward: --
 29247/100000: episode: 830, duration: 0.166s, episode steps: 24, steps per second: 144, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.128 [-1.039, 0.567], mean_best_reward: --
 29256/100000: episode: 831, duration: 0.075s, episode steps: 9, steps per second: 121, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.222 [0.000, 1.000], mean observation: 0.125 [-1.410, 2.178], mean_best_reward: --
 29284/100000: epis

 30434/100000: episode: 862, duration: 0.323s, episode steps: 59, steps per second: 183, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.492 [0.000, 1.000], mean observation: -0.030 [-1.185, 0.800], mean_best_reward: --
 30500/100000: episode: 863, duration: 0.380s, episode steps: 66, steps per second: 174, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.183 [-1.166, 1.030], mean_best_reward: --
 30550/100000: episode: 864, duration: 0.232s, episode steps: 50, steps per second: 215, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.049 [-1.117, 1.111], mean_best_reward: --
 30567/100000: episode: 865, duration: 0.103s, episode steps: 17, steps per second: 164, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.647 [0.000, 1.000], mean observation: -0.106 [-1.822, 0.974], mean_best_reward: --
 30582/100000: e

 31786/100000: episode: 897, duration: 0.132s, episode steps: 29, steps per second: 220, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: -0.070 [-1.216, 0.600], mean_best_reward: --
 31856/100000: episode: 898, duration: 0.316s, episode steps: 70, steps per second: 221, episode reward: 70.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.069 [-0.883, 1.196], mean_best_reward: --
 31888/100000: episode: 899, duration: 0.149s, episode steps: 32, steps per second: 214, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.562 [0.000, 1.000], mean observation: -0.096 [-1.938, 0.945], mean_best_reward: --
 31929/100000: episode: 900, duration: 0.193s, episode steps: 41, steps per second: 212, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: 0.022 [-0.767, 1.314], mean_best_reward: --
 31946/100000: epi

 32975/100000: episode: 931, duration: 0.159s, episode steps: 33, steps per second: 208, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: -0.071 [-1.057, 0.782], mean_best_reward: --
 32991/100000: episode: 932, duration: 0.077s, episode steps: 16, steps per second: 208, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.625 [0.000, 1.000], mean observation: -0.118 [-1.784, 0.943], mean_best_reward: --
 33036/100000: episode: 933, duration: 0.213s, episode steps: 45, steps per second: 211, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.099 [-1.185, 0.828], mean_best_reward: --
 33065/100000: episode: 934, duration: 0.135s, episode steps: 29, steps per second: 214, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.095 [-0.381, 0.840], mean_best_reward: --
 33086/100000: ep

 34281/100000: episode: 966, duration: 0.401s, episode steps: 83, steps per second: 207, episode reward: 83.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: -0.022 [-1.379, 0.978], mean_best_reward: --
 34338/100000: episode: 967, duration: 0.306s, episode steps: 57, steps per second: 186, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.232 [-1.398, 0.575], mean_best_reward: --
 34364/100000: episode: 968, duration: 0.153s, episode steps: 26, steps per second: 169, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.081 [-0.782, 1.258], mean_best_reward: --
 34380/100000: episode: 969, duration: 0.090s, episode steps: 16, steps per second: 177, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.625 [0.000, 1.000], mean observation: -0.075 [-1.875, 1.216], mean_best_reward: --
 34486/100000: ep

 35788/100000: episode: 1000, duration: 0.229s, episode steps: 48, steps per second: 209, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.054 [-1.195, 0.558], mean_best_reward: --
 35808/100000: episode: 1001, duration: 0.099s, episode steps: 20, steps per second: 203, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: -0.081 [-1.136, 0.627], mean_best_reward: 87.000000
 35838/100000: episode: 1002, duration: 0.164s, episode steps: 30, steps per second: 183, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.567 [0.000, 1.000], mean observation: -0.103 [-1.889, 0.822], mean_best_reward: --
 35882/100000: episode: 1003, duration: 0.231s, episode steps: 44, steps per second: 190, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.096 [-1.048, 0.636], mean_best_reward: --
 3595

 37060/100000: episode: 1034, duration: 0.305s, episode steps: 37, steps per second: 121, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.459 [0.000, 1.000], mean observation: -0.036 [-1.267, 0.649], mean_best_reward: --
 37089/100000: episode: 1035, duration: 0.174s, episode steps: 29, steps per second: 167, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: 0.063 [-0.597, 1.032], mean_best_reward: --
 37141/100000: episode: 1036, duration: 0.260s, episode steps: 52, steps per second: 200, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.120 [-0.506, 1.341], mean_best_reward: --
 37191/100000: episode: 1037, duration: 0.229s, episode steps: 50, steps per second: 218, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.029 [-0.775, 1.340], mean_best_reward: --
 37227/100000: 

 38494/100000: episode: 1068, duration: 0.174s, episode steps: 37, steps per second: 213, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.105 [-0.396, 1.077], mean_best_reward: --
 38518/100000: episode: 1069, duration: 0.128s, episode steps: 24, steps per second: 188, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.542 [0.000, 1.000], mean observation: -0.069 [-1.615, 0.825], mean_best_reward: --
 38571/100000: episode: 1070, duration: 0.270s, episode steps: 53, steps per second: 196, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: 0.054 [-0.970, 1.722], mean_best_reward: --
 38592/100000: episode: 1071, duration: 0.116s, episode steps: 21, steps per second: 181, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: -0.043 [-1.436, 1.004], mean_best_reward: --
 38658/100000:

 39744/100000: episode: 1102, duration: 0.341s, episode steps: 56, steps per second: 164, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.482 [0.000, 1.000], mean observation: -0.071 [-1.068, 0.819], mean_best_reward: --
 39778/100000: episode: 1103, duration: 0.167s, episode steps: 34, steps per second: 204, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.138 [-1.161, 0.703], mean_best_reward: --
 39944/100000: episode: 1104, duration: 0.792s, episode steps: 166, steps per second: 210, episode reward: 166.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: -0.117 [-1.401, 1.009], mean_best_reward: --
 39959/100000: episode: 1105, duration: 0.087s, episode steps: 15, steps per second: 173, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.084 [-1.787, 1.153], mean_best_reward: --
 39979/100

 41153/100000: episode: 1136, duration: 0.413s, episode steps: 83, steps per second: 201, episode reward: 83.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.112 [-0.792, 1.078], mean_best_reward: --
 41180/100000: episode: 1137, duration: 0.168s, episode steps: 27, steps per second: 160, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.080 [-1.235, 0.811], mean_best_reward: --
 41219/100000: episode: 1138, duration: 0.247s, episode steps: 39, steps per second: 158, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: -0.116 [-1.072, 0.713], mean_best_reward: --
 41267/100000: episode: 1139, duration: 0.293s, episode steps: 48, steps per second: 164, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.088 [-1.107, 0.895], mean_best_reward: --
 41327/100000

 42567/100000: episode: 1170, duration: 0.260s, episode steps: 52, steps per second: 200, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.068 [-0.938, 0.585], mean_best_reward: --
 42619/100000: episode: 1171, duration: 0.344s, episode steps: 52, steps per second: 151, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.086 [-0.957, 1.160], mean_best_reward: --
 42639/100000: episode: 1172, duration: 0.141s, episode steps: 20, steps per second: 142, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: 0.068 [-0.981, 1.698], mean_best_reward: --
 42656/100000: episode: 1173, duration: 0.085s, episode steps: 17, steps per second: 201, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: 0.055 [-0.996, 1.554], mean_best_reward: --
 42681/100000: 

 44108/100000: episode: 1205, duration: 0.264s, episode steps: 32, steps per second: 121, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.103 [-1.301, 0.506], mean_best_reward: --
 44135/100000: episode: 1206, duration: 0.207s, episode steps: 27, steps per second: 130, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.068 [-1.282, 0.632], mean_best_reward: --
 44166/100000: episode: 1207, duration: 0.147s, episode steps: 31, steps per second: 211, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.085 [-0.338, 0.988], mean_best_reward: --
 44232/100000: episode: 1208, duration: 0.310s, episode steps: 66, steps per second: 213, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.530 [0.000, 1.000], mean observation: 0.169 [-0.667, 0.969], mean_best_reward: --
 44270/100000:

 45707/100000: episode: 1239, duration: 0.265s, episode steps: 45, steps per second: 170, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: -0.088 [-1.182, 0.569], mean_best_reward: --
 45764/100000: episode: 1240, duration: 0.274s, episode steps: 57, steps per second: 208, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.028 [-0.963, 1.040], mean_best_reward: --
 45851/100000: episode: 1241, duration: 0.411s, episode steps: 87, steps per second: 211, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: -0.080 [-1.524, 0.783], mean_best_reward: --
 45863/100000: episode: 1242, duration: 0.059s, episode steps: 12, steps per second: 205, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.583 [0.000, 1.000], mean observation: -0.115 [-1.608, 0.961], mean_best_reward: --
 45887/10000

 46857/100000: episode: 1273, duration: 0.076s, episode steps: 15, steps per second: 198, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.077 [-0.834, 1.376], mean_best_reward: --
 46886/100000: episode: 1274, duration: 0.134s, episode steps: 29, steps per second: 217, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: 0.094 [-0.595, 1.159], mean_best_reward: --
 46919/100000: episode: 1275, duration: 0.161s, episode steps: 33, steps per second: 205, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: -0.025 [-1.280, 0.941], mean_best_reward: --
 46958/100000: episode: 1276, duration: 0.181s, episode steps: 39, steps per second: 215, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: -0.081 [-1.029, 0.713], mean_best_reward: --
 47011/100000:

 48246/100000: episode: 1307, duration: 0.320s, episode steps: 60, steps per second: 187, episode reward: 60.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.128 [-0.735, 1.353], mean_best_reward: --
 48287/100000: episode: 1308, duration: 0.228s, episode steps: 41, steps per second: 180, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.101 [-0.347, 0.746], mean_best_reward: --
 48303/100000: episode: 1309, duration: 0.098s, episode steps: 16, steps per second: 164, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.086 [-0.778, 1.296], mean_best_reward: --
 48362/100000: episode: 1310, duration: 0.338s, episode steps: 59, steps per second: 175, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.080 [-1.163, 0.612], mean_best_reward: --
 48417/100000: 

 49653/100000: episode: 1341, duration: 0.327s, episode steps: 60, steps per second: 183, episode reward: 60.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.009 [-1.250, 0.931], mean_best_reward: --
 49701/100000: episode: 1342, duration: 0.225s, episode steps: 48, steps per second: 213, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: -0.081 [-1.545, 0.421], mean_best_reward: --
 49740/100000: episode: 1343, duration: 0.216s, episode steps: 39, steps per second: 180, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.564 [0.000, 1.000], mean observation: 0.196 [-0.594, 1.155], mean_best_reward: --
 49786/100000: episode: 1344, duration: 0.246s, episode steps: 46, steps per second: 187, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.099 [-0.909, 1.244], mean_best_reward: --
 49816/100000: 

 51159/100000: episode: 1375, duration: 0.266s, episode steps: 53, steps per second: 199, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: -0.033 [-1.089, 0.557], mean_best_reward: --
 51181/100000: episode: 1376, duration: 0.119s, episode steps: 22, steps per second: 185, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: -0.078 [-1.062, 0.589], mean_best_reward: --
 51214/100000: episode: 1377, duration: 0.164s, episode steps: 33, steps per second: 201, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.046 [-0.601, 1.216], mean_best_reward: --
 51277/100000: episode: 1378, duration: 0.297s, episode steps: 63, steps per second: 212, episode reward: 63.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.492 [0.000, 1.000], mean observation: 0.033 [-0.547, 1.181], mean_best_reward: --
 51321/100000:

 52516/100000: episode: 1409, duration: 0.248s, episode steps: 50, steps per second: 201, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.070 [-1.159, 0.696], mean_best_reward: --
 52539/100000: episode: 1410, duration: 0.119s, episode steps: 23, steps per second: 192, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: -0.102 [-1.147, 0.577], mean_best_reward: --
 52561/100000: episode: 1411, duration: 0.154s, episode steps: 22, steps per second: 143, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.053 [-0.991, 1.624], mean_best_reward: --
 52608/100000: episode: 1412, duration: 0.290s, episode steps: 47, steps per second: 162, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: 0.038 [-0.591, 1.142], mean_best_reward: --
 52706/100000:

 53704/100000: episode: 1443, duration: 0.157s, episode steps: 21, steps per second: 134, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.381 [0.000, 1.000], mean observation: 0.095 [-0.959, 1.869], mean_best_reward: --
 53721/100000: episode: 1444, duration: 0.090s, episode steps: 17, steps per second: 190, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.114 [-0.551, 1.038], mean_best_reward: --
 53786/100000: episode: 1445, duration: 0.390s, episode steps: 65, steps per second: 167, episode reward: 65.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.126 [-0.918, 0.563], mean_best_reward: --
 53868/100000: episode: 1446, duration: 0.457s, episode steps: 82, steps per second: 179, episode reward: 82.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.072 [-0.940, 1.372], mean_best_reward: --
 53900/100000:

 55240/100000: episode: 1478, duration: 0.336s, episode steps: 65, steps per second: 194, episode reward: 65.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.207 [-1.445, 0.589], mean_best_reward: --
 55268/100000: episode: 1479, duration: 0.149s, episode steps: 28, steps per second: 188, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.062 [-1.120, 0.824], mean_best_reward: --
 55327/100000: episode: 1480, duration: 0.283s, episode steps: 59, steps per second: 209, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: 0.093 [-0.625, 1.484], mean_best_reward: --
 55369/100000: episode: 1481, duration: 0.218s, episode steps: 42, steps per second: 192, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: 0.058 [-0.955, 1.576], mean_best_reward: --
 55433/100000:

 56520/100000: episode: 1513, duration: 0.143s, episode steps: 28, steps per second: 196, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: 0.095 [-0.448, 1.318], mean_best_reward: --
 56546/100000: episode: 1514, duration: 0.120s, episode steps: 26, steps per second: 217, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.091 [-0.998, 0.464], mean_best_reward: --
 56572/100000: episode: 1515, duration: 0.130s, episode steps: 26, steps per second: 200, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.120 [-1.210, 0.419], mean_best_reward: --
 56607/100000: episode: 1516, duration: 0.214s, episode steps: 35, steps per second: 163, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.457 [0.000, 1.000], mean observation: 0.076 [-0.853, 1.760], mean_best_reward: --
 56626/100000:

 57857/100000: episode: 1547, duration: 0.447s, episode steps: 100, steps per second: 224, episode reward: 100.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.067 [-1.151, 0.771], mean_best_reward: --
 57922/100000: episode: 1548, duration: 0.299s, episode steps: 65, steps per second: 217, episode reward: 65.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.141 [-1.173, 0.507], mean_best_reward: --
 57935/100000: episode: 1549, duration: 0.065s, episode steps: 13, steps per second: 201, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: 0.092 [-0.803, 1.345], mean_best_reward: --
 57988/100000: episode: 1550, duration: 0.241s, episode steps: 53, steps per second: 220, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.095 [-1.542, 1.457], mean_best_reward: --
 58017/1000

 59227/100000: episode: 1581, duration: 0.118s, episode steps: 24, steps per second: 204, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.542 [0.000, 1.000], mean observation: -0.115 [-1.337, 0.566], mean_best_reward: --
 59296/100000: episode: 1582, duration: 0.330s, episode steps: 69, steps per second: 209, episode reward: 69.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: 0.003 [-1.526, 1.003], mean_best_reward: --
 59314/100000: episode: 1583, duration: 0.088s, episode steps: 18, steps per second: 205, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.075 [-1.200, 0.807], mean_best_reward: --
 59342/100000: episode: 1584, duration: 0.138s, episode steps: 28, steps per second: 203, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: -0.126 [-1.472, 0.399], mean_best_reward: --
 59359/100000

 60531/100000: episode: 1615, duration: 0.207s, episode steps: 28, steps per second: 135, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.087 [-0.949, 0.445], mean_best_reward: --
 60553/100000: episode: 1616, duration: 0.158s, episode steps: 22, steps per second: 139, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: -0.101 [-1.683, 0.782], mean_best_reward: --
 60601/100000: episode: 1617, duration: 0.352s, episode steps: 48, steps per second: 136, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.101 [-1.195, 0.774], mean_best_reward: --
 60655/100000: episode: 1618, duration: 0.375s, episode steps: 54, steps per second: 144, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.200 [-0.579, 1.271], mean_best_reward: --
 60677/100000

 61910/100000: episode: 1649, duration: 0.791s, episode steps: 62, steps per second: 78, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: -0.136 [-1.680, 0.744], mean_best_reward: --
 61926/100000: episode: 1650, duration: 0.093s, episode steps: 16, steps per second: 171, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.625 [0.000, 1.000], mean observation: -0.102 [-1.787, 0.945], mean_best_reward: --
 61967/100000: episode: 1651, duration: 0.219s, episode steps: 41, steps per second: 187, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.537 [0.000, 1.000], mean observation: -0.031 [-1.287, 0.787], mean_best_reward: 84.500000
 61982/100000: episode: 1652, duration: 0.074s, episode steps: 15, steps per second: 204, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: 0.128 [-0.548, 1.060], mean_best_reward: --
 62040/

 63425/100000: episode: 1683, duration: 0.225s, episode steps: 46, steps per second: 204, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.457 [0.000, 1.000], mean observation: -0.019 [-1.110, 1.413], mean_best_reward: --
 63466/100000: episode: 1684, duration: 0.226s, episode steps: 41, steps per second: 182, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: 0.154 [-0.638, 1.860], mean_best_reward: --
 63560/100000: episode: 1685, duration: 0.658s, episode steps: 94, steps per second: 143, episode reward: 94.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.003 [-0.832, 0.931], mean_best_reward: --
 63591/100000: episode: 1686, duration: 0.154s, episode steps: 31, steps per second: 201, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: 0.053 [-0.939, 1.293], mean_best_reward: --
 63619/100000:

 64663/100000: episode: 1718, duration: 0.358s, episode steps: 52, steps per second: 145, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.040 [-0.635, 1.096], mean_best_reward: --
 64737/100000: episode: 1719, duration: 0.404s, episode steps: 74, steps per second: 183, episode reward: 74.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.527 [0.000, 1.000], mean observation: 0.194 [-0.516, 1.108], mean_best_reward: --
 64754/100000: episode: 1720, duration: 0.107s, episode steps: 17, steps per second: 159, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.091 [-1.396, 0.792], mean_best_reward: --
 64779/100000: episode: 1721, duration: 0.155s, episode steps: 25, steps per second: 162, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.120 [-1.251, 0.763], mean_best_reward: --
 64803/100000:

 66126/100000: episode: 1753, duration: 0.217s, episode steps: 41, steps per second: 189, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.537 [0.000, 1.000], mean observation: 0.052 [-0.636, 0.927], mean_best_reward: --
 66157/100000: episode: 1754, duration: 0.168s, episode steps: 31, steps per second: 184, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: -0.104 [-0.871, 0.415], mean_best_reward: --
 66173/100000: episode: 1755, duration: 0.097s, episode steps: 16, steps per second: 165, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: 0.108 [-0.578, 1.243], mean_best_reward: --
 66212/100000: episode: 1756, duration: 0.211s, episode steps: 39, steps per second: 185, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: 0.119 [-1.108, 1.149], mean_best_reward: --
 66257/100000: 

 67297/100000: episode: 1788, duration: 0.135s, episode steps: 21, steps per second: 156, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: -0.078 [-1.057, 0.585], mean_best_reward: --
 67359/100000: episode: 1789, duration: 0.368s, episode steps: 62, steps per second: 168, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.055 [-0.671, 1.077], mean_best_reward: --
 67389/100000: episode: 1790, duration: 0.173s, episode steps: 30, steps per second: 173, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.061 [-1.227, 0.649], mean_best_reward: --
 67430/100000: episode: 1791, duration: 0.196s, episode steps: 41, steps per second: 209, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: -0.105 [-1.382, 0.507], mean_best_reward: --
 67470/100000

 68684/100000: episode: 1823, duration: 0.406s, episode steps: 88, steps per second: 217, episode reward: 88.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.027 [-1.166, 1.271], mean_best_reward: --
 68742/100000: episode: 1824, duration: 0.298s, episode steps: 58, steps per second: 194, episode reward: 58.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.466 [0.000, 1.000], mean observation: -0.122 [-0.885, 0.439], mean_best_reward: --
 68767/100000: episode: 1825, duration: 0.116s, episode steps: 25, steps per second: 215, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.114 [-0.558, 1.190], mean_best_reward: --
 68786/100000: episode: 1826, duration: 0.099s, episode steps: 19, steps per second: 192, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: -0.077 [-1.485, 0.985], mean_best_reward: --
 68812/100000

 69881/100000: episode: 1857, duration: 0.289s, episode steps: 65, steps per second: 225, episode reward: 65.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: 0.005 [-0.596, 1.130], mean_best_reward: --
 69962/100000: episode: 1858, duration: 0.370s, episode steps: 81, steps per second: 219, episode reward: 81.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.506 [0.000, 1.000], mean observation: -0.093 [-1.075, 0.870], mean_best_reward: --
 70010/100000: episode: 1859, duration: 0.228s, episode steps: 48, steps per second: 211, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.061 [-0.967, 0.739], mean_best_reward: --
 70020/100000: episode: 1860, duration: 0.047s, episode steps: 10, steps per second: 211, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.128 [-0.771, 1.399], mean_best_reward: --
 70043/100000:

 71438/100000: episode: 1892, duration: 0.242s, episode steps: 48, steps per second: 198, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.167 [-0.457, 0.966], mean_best_reward: --
 71482/100000: episode: 1893, duration: 0.230s, episode steps: 44, steps per second: 192, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: 0.030 [-0.625, 1.200], mean_best_reward: --
 71495/100000: episode: 1894, duration: 0.065s, episode steps: 13, steps per second: 199, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: 0.121 [-0.740, 1.305], mean_best_reward: --
 71517/100000: episode: 1895, duration: 0.116s, episode steps: 22, steps per second: 190, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: 0.093 [-0.598, 0.944], mean_best_reward: --
 71540/100000: e

 72549/100000: episode: 1926, duration: 0.527s, episode steps: 110, steps per second: 209, episode reward: 110.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: -0.055 [-1.406, 0.992], mean_best_reward: --
 72601/100000: episode: 1927, duration: 0.248s, episode steps: 52, steps per second: 209, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.077 [-1.484, 0.782], mean_best_reward: --
 72635/100000: episode: 1928, duration: 0.162s, episode steps: 34, steps per second: 209, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: 0.088 [-0.600, 1.340], mean_best_reward: --
 72713/100000: episode: 1929, duration: 0.352s, episode steps: 78, steps per second: 222, episode reward: 78.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.013 [-0.552, 1.390], mean_best_reward: --
 72750/10000

 74285/100000: episode: 1960, duration: 0.365s, episode steps: 69, steps per second: 189, episode reward: 69.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: -0.003 [-1.005, 1.129], mean_best_reward: --
 74329/100000: episode: 1961, duration: 0.240s, episode steps: 44, steps per second: 183, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.236 [-0.924, 1.748], mean_best_reward: --
 74393/100000: episode: 1962, duration: 0.362s, episode steps: 64, steps per second: 177, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.082 [-1.091, 1.407], mean_best_reward: --
 74418/100000: episode: 1963, duration: 0.168s, episode steps: 25, steps per second: 148, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.088 [-0.439, 1.235], mean_best_reward: --
 74452/100000: 

 75566/100000: episode: 1994, duration: 0.269s, episode steps: 40, steps per second: 149, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.081 [-0.442, 0.891], mean_best_reward: --
 75601/100000: episode: 1995, duration: 0.228s, episode steps: 35, steps per second: 153, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: 0.109 [-0.769, 1.611], mean_best_reward: --
 75619/100000: episode: 1996, duration: 0.095s, episode steps: 18, steps per second: 189, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: 0.082 [-0.756, 1.360], mean_best_reward: --
 75656/100000: episode: 1997, duration: 0.215s, episode steps: 37, steps per second: 172, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.459 [0.000, 1.000], mean observation: -0.063 [-0.974, 0.591], mean_best_reward: --
 75690/100000: 

 76929/100000: episode: 2029, duration: 0.249s, episode steps: 51, steps per second: 205, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.081 [-1.265, 0.647], mean_best_reward: --
 76965/100000: episode: 2030, duration: 0.176s, episode steps: 36, steps per second: 205, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.064 [-0.462, 1.088], mean_best_reward: --
 76997/100000: episode: 2031, duration: 0.164s, episode steps: 32, steps per second: 195, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: 0.075 [-0.780, 1.694], mean_best_reward: --
 77061/100000: episode: 2032, duration: 0.305s, episode steps: 64, steps per second: 210, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.029 [-0.917, 1.228], mean_best_reward: --
 77098/100000:

 78326/100000: episode: 2064, duration: 0.189s, episode steps: 38, steps per second: 201, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.068 [-1.004, 0.440], mean_best_reward: --
 78381/100000: episode: 2065, duration: 0.273s, episode steps: 55, steps per second: 202, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: 0.026 [-0.757, 1.591], mean_best_reward: --
 78432/100000: episode: 2066, duration: 0.231s, episode steps: 51, steps per second: 220, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.137 [-0.623, 0.974], mean_best_reward: --
 78491/100000: episode: 2067, duration: 0.296s, episode steps: 59, steps per second: 199, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: 0.033 [-1.268, 1.823], mean_best_reward: --
 78510/100000: 

 79721/100000: episode: 2098, duration: 0.233s, episode steps: 48, steps per second: 206, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.134 [-0.496, 1.551], mean_best_reward: --
 79805/100000: episode: 2099, duration: 0.422s, episode steps: 84, steps per second: 199, episode reward: 84.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: -0.220 [-1.843, 1.094], mean_best_reward: --
 79867/100000: episode: 2100, duration: 0.300s, episode steps: 62, steps per second: 207, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.174 [-1.468, 0.595], mean_best_reward: --
 79904/100000: episode: 2101, duration: 0.215s, episode steps: 37, steps per second: 172, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.119 [-0.997, 0.604], mean_best_reward: 82.000000
 79938

 81159/100000: episode: 2133, duration: 0.185s, episode steps: 40, steps per second: 217, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: -0.143 [-1.081, 0.624], mean_best_reward: --
 81177/100000: episode: 2134, duration: 0.094s, episode steps: 18, steps per second: 191, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: 0.133 [-0.571, 1.041], mean_best_reward: --
 81222/100000: episode: 2135, duration: 0.235s, episode steps: 45, steps per second: 191, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.167 [-1.632, 0.756], mean_best_reward: --
 81283/100000: episode: 2136, duration: 0.305s, episode steps: 61, steps per second: 200, episode reward: 61.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.100 [-0.968, 0.629], mean_best_reward: --
 81299/100000

 82784/100000: episode: 2167, duration: 0.204s, episode steps: 37, steps per second: 181, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.096 [-1.086, 0.625], mean_best_reward: --
 82818/100000: episode: 2168, duration: 0.195s, episode steps: 34, steps per second: 174, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.082 [-0.952, 0.433], mean_best_reward: --
 82869/100000: episode: 2169, duration: 0.328s, episode steps: 51, steps per second: 156, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.134 [-1.237, 0.802], mean_best_reward: --
 82932/100000: episode: 2170, duration: 0.304s, episode steps: 63, steps per second: 207, episode reward: 63.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.137 [-0.716, 0.566], mean_best_reward: --
 82947/10000

 84010/100000: episode: 2201, duration: 0.125s, episode steps: 22, steps per second: 176, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.107 [-1.053, 0.609], mean_best_reward: 119.500000
 84039/100000: episode: 2202, duration: 0.161s, episode steps: 29, steps per second: 180, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: 0.085 [-0.966, 1.482], mean_best_reward: --
 84074/100000: episode: 2203, duration: 0.213s, episode steps: 35, steps per second: 164, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.097 [-0.607, 0.995], mean_best_reward: --
 84146/100000: episode: 2204, duration: 0.449s, episode steps: 72, steps per second: 161, episode reward: 72.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: -0.056 [-1.632, 0.784], mean_best_reward: --
 84201

 85366/100000: episode: 2236, duration: 0.242s, episode steps: 38, steps per second: 157, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.156 [-0.585, 1.426], mean_best_reward: --
 85393/100000: episode: 2237, duration: 0.208s, episode steps: 27, steps per second: 130, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: -0.111 [-0.975, 0.549], mean_best_reward: --
 85425/100000: episode: 2238, duration: 0.159s, episode steps: 32, steps per second: 201, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.082 [-1.403, 0.941], mean_best_reward: --
 85452/100000: episode: 2239, duration: 0.186s, episode steps: 27, steps per second: 145, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: 0.061 [-0.800, 1.432], mean_best_reward: --
 85494/100000:

 86905/100000: episode: 2270, duration: 0.202s, episode steps: 45, steps per second: 223, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.127 [-1.015, 0.652], mean_best_reward: --
 86944/100000: episode: 2271, duration: 0.175s, episode steps: 39, steps per second: 223, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: -0.010 [-0.781, 1.063], mean_best_reward: --
 86966/100000: episode: 2272, duration: 0.116s, episode steps: 22, steps per second: 189, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.097 [-0.578, 0.914], mean_best_reward: --
 86990/100000: episode: 2273, duration: 0.111s, episode steps: 24, steps per second: 216, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.583 [0.000, 1.000], mean observation: -0.077 [-1.702, 0.798], mean_best_reward: --
 87029/100000

 88347/100000: episode: 2304, duration: 0.562s, episode steps: 127, steps per second: 226, episode reward: 127.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: 0.207 [-1.350, 2.152], mean_best_reward: --
 88371/100000: episode: 2305, duration: 0.116s, episode steps: 24, steps per second: 207, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: 0.058 [-0.600, 1.326], mean_best_reward: --
 88401/100000: episode: 2306, duration: 0.139s, episode steps: 30, steps per second: 216, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.433 [0.000, 1.000], mean observation: -0.123 [-0.719, 0.414], mean_best_reward: --
 88430/100000: episode: 2307, duration: 0.132s, episode steps: 29, steps per second: 219, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: -0.083 [-0.915, 0.591], mean_best_reward: --
 88454/10000

 89366/100000: episode: 2339, duration: 0.159s, episode steps: 35, steps per second: 220, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.113 [-1.243, 0.468], mean_best_reward: --
 89383/100000: episode: 2340, duration: 0.081s, episode steps: 17, steps per second: 209, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.412 [0.000, 1.000], mean observation: 0.072 [-0.985, 1.671], mean_best_reward: --
 89394/100000: episode: 2341, duration: 0.055s, episode steps: 11, steps per second: 200, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.364 [0.000, 1.000], mean observation: 0.121 [-0.953, 1.551], mean_best_reward: --
 89421/100000: episode: 2342, duration: 0.132s, episode steps: 27, steps per second: 204, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.079 [-1.664, 0.951], mean_best_reward: --
 89459/100000:

 90613/100000: episode: 2373, duration: 0.245s, episode steps: 55, steps per second: 224, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: 0.004 [-1.249, 1.001], mean_best_reward: --
 90677/100000: episode: 2374, duration: 0.283s, episode steps: 64, steps per second: 226, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.088 [-0.476, 0.890], mean_best_reward: --
 90706/100000: episode: 2375, duration: 0.132s, episode steps: 29, steps per second: 220, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: -0.101 [-1.083, 0.492], mean_best_reward: --
 90735/100000: episode: 2376, duration: 0.142s, episode steps: 29, steps per second: 204, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.414 [0.000, 1.000], mean observation: 0.049 [-1.031, 1.765], mean_best_reward: --
 90762/100000: 

 91956/100000: episode: 2408, duration: 0.129s, episode steps: 28, steps per second: 217, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.571 [0.000, 1.000], mean observation: -0.090 [-1.854, 0.763], mean_best_reward: --
 92063/100000: episode: 2409, duration: 0.479s, episode steps: 107, steps per second: 223, episode reward: 107.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.155 [-0.994, 1.149], mean_best_reward: --
 92104/100000: episode: 2410, duration: 0.183s, episode steps: 41, steps per second: 224, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: -0.106 [-0.978, 0.764], mean_best_reward: --
 92133/100000: episode: 2411, duration: 0.134s, episode steps: 29, steps per second: 217, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.552 [0.000, 1.000], mean observation: 0.084 [-0.637, 1.113], mean_best_reward: --
 92161/10000

 93358/100000: episode: 2443, duration: 0.591s, episode steps: 131, steps per second: 222, episode reward: 131.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.034 [-1.140, 0.965], mean_best_reward: --
 93396/100000: episode: 2444, duration: 0.173s, episode steps: 38, steps per second: 219, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.125 [-0.359, 0.999], mean_best_reward: --
 93476/100000: episode: 2445, duration: 0.375s, episode steps: 80, steps per second: 213, episode reward: 80.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: 0.103 [-0.771, 1.689], mean_best_reward: --
 93510/100000: episode: 2446, duration: 0.155s, episode steps: 34, steps per second: 220, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.441 [0.000, 1.000], mean observation: 0.041 [-0.965, 1.744], mean_best_reward: --
 93543/100000

 94724/100000: episode: 2478, duration: 0.087s, episode steps: 18, steps per second: 208, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.064 [-1.907, 1.163], mean_best_reward: --
 94772/100000: episode: 2479, duration: 0.215s, episode steps: 48, steps per second: 223, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.113 [-0.468, 1.019], mean_best_reward: --
 94815/100000: episode: 2480, duration: 0.194s, episode steps: 43, steps per second: 222, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: -0.116 [-1.268, 0.799], mean_best_reward: --
 94834/100000: episode: 2481, duration: 0.097s, episode steps: 19, steps per second: 195, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.421 [0.000, 1.000], mean observation: 0.061 [-0.831, 1.520], mean_best_reward: --
 94902/100000:

 96138/100000: episode: 2513, duration: 0.069s, episode steps: 14, steps per second: 204, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.571 [0.000, 1.000], mean observation: -0.089 [-1.613, 1.019], mean_best_reward: --
 96162/100000: episode: 2514, duration: 0.111s, episode steps: 24, steps per second: 216, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.542 [0.000, 1.000], mean observation: -0.110 [-1.603, 0.648], mean_best_reward: --
 96213/100000: episode: 2515, duration: 0.243s, episode steps: 51, steps per second: 210, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.128 [-0.496, 1.316], mean_best_reward: --
 96232/100000: episode: 2516, duration: 0.088s, episode steps: 19, steps per second: 215, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: 0.108 [-0.607, 1.213], mean_best_reward: --
 96260/100000:

 97305/100000: episode: 2548, duration: 0.148s, episode steps: 32, steps per second: 217, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.107 [-0.917, 0.631], mean_best_reward: --
 97347/100000: episode: 2549, duration: 0.189s, episode steps: 42, steps per second: 222, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.123 [-1.155, 0.773], mean_best_reward: --
 97369/100000: episode: 2550, duration: 0.104s, episode steps: 22, steps per second: 212, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.062 [-1.341, 0.827], mean_best_reward: --
 97407/100000: episode: 2551, duration: 0.171s, episode steps: 38, steps per second: 223, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.145 [-0.929, 0.618], mean_best_reward: 79.000000
 9743

 98603/100000: episode: 2582, duration: 0.313s, episode steps: 54, steps per second: 172, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.079 [-1.535, 0.608], mean_best_reward: --
 98639/100000: episode: 2583, duration: 0.192s, episode steps: 36, steps per second: 188, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.101 [-0.924, 0.538], mean_best_reward: --
 98661/100000: episode: 2584, duration: 0.129s, episode steps: 22, steps per second: 171, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.591 [0.000, 1.000], mean observation: -0.038 [-2.211, 1.531], mean_best_reward: --
 98711/100000: episode: 2585, duration: 0.276s, episode steps: 50, steps per second: 181, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.133 [-0.935, 1.336], mean_best_reward: --
 98794/100000

<tensorflow.python.keras.callbacks.History at 0x7fb3f0efe640>

After training is done, we save the best weights.

In [8]:
# Write the agent's current weights to disk; the file name embeds the
# environment id, and overwrite=True is passed so an existing file is replaced.
weights_path = 'cem_{}_params.h5f'.format(ENV_NAME)
cem.save_weights(weights_path, overwrite=True)

Finally, evaluate our algorithm for 5 episodes.

In [9]:
# Evaluate the trained agent for five episodes with rendering switched on.
cem.test(
    env,
    visualize=True,
    nb_episodes=5,
)

Testing for 5 episodes ...
Episode 1: reward: 124.000, steps: 124
Episode 2: reward: 51.000, steps: 51
Episode 3: reward: 57.000, steps: 57
Episode 4: reward: 53.000, steps: 53
Episode 5: reward: 60.000, steps: 60


<tensorflow.python.keras.callbacks.History at 0x7fb3f1099190>

Option 2: Deep network

In [10]:
 model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

Model Summary.

In [11]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)               

Defining the memory.

In [12]:
# Fresh memory for the Option 2 agent, configured exactly like Option 1's.
memory = EpisodeParameterMemory(
    window_length=1,
    limit=1000,
)

Defining the CEM agent, which wraps the model and the memory.

In [13]:
# Build the CEM agent around the Option 2 network and its memory, using the
# same hyperparameters as the Option 1 agent.
cem = CEMAgent(
    model=model,
    memory=memory,
    nb_actions=nb_actions,
    elite_frac=0.05,
    batch_size=50,
    train_interval=50,
    nb_steps_warmup=2000,
)

Compiling the agent.

In [14]:
# Compile the CEM agent (called with no arguments) before it is trained with fit().
cem.compile()

Fitting the model.

In [15]:
# Train the Option 2 agent for 100000 environment steps without rendering;
# verbose=2 produces one log line per episode.
cem.fit(
    env,
    nb_steps=100000,
    verbose=2,
    visualize=False,
)

Training for 100000 steps ...
    24/100000: episode: 1, duration: 0.170s, episode steps: 24, steps per second: 141, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.583 [0.000, 1.000], mean observation: -0.062 [-1.615, 1.017], mean_best_reward: --
    36/100000: episode: 2, duration: 0.091s, episode steps: 12, steps per second: 132, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.750 [0.000, 1.000], mean observation: -0.118 [-2.122, 1.199], mean_best_reward: --
    48/100000: episode: 3, duration: 0.073s, episode steps: 12, steps per second: 165, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.917 [0.000, 1.000], mean observation: -0.103 [-2.966, 1.913], mean_best_reward: --
    66/100000: episode: 4, duration: 0.088s, episode steps: 18, steps per second: 204, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.096 [-1.160, 2.019], mean_best_reward

   571/100000: episode: 37, duration: 0.091s, episode steps: 19, steps per second: 209, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.316 [0.000, 1.000], mean observation: 0.110 [-1.344, 2.415], mean_best_reward: --
   585/100000: episode: 38, duration: 0.075s, episode steps: 14, steps per second: 187, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.643 [0.000, 1.000], mean observation: -0.073 [-1.861, 1.221], mean_best_reward: --
   598/100000: episode: 39, duration: 0.062s, episode steps: 13, steps per second: 208, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.231 [0.000, 1.000], mean observation: 0.115 [-1.340, 2.237], mean_best_reward: --
   608/100000: episode: 40, duration: 0.052s, episode steps: 10, steps per second: 194, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.136 [-1.181, 2.058], mean_best_reward: --
   621/100000: episode:

  1126/100000: episode: 71, duration: 0.094s, episode steps: 15, steps per second: 159, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.101 [-1.805, 2.847], mean_best_reward: --
  1135/100000: episode: 72, duration: 0.048s, episode steps: 9, steps per second: 187, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.143 [-2.287, 1.337], mean_best_reward: --
  1144/100000: episode: 73, duration: 0.054s, episode steps: 9, steps per second: 168, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.111 [0.000, 1.000], mean observation: 0.147 [-1.348, 2.229], mean_best_reward: --
  1158/100000: episode: 74, duration: 0.091s, episode steps: 14, steps per second: 154, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.857 [0.000, 1.000], mean observation: -0.077 [-2.972, 1.988], mean_best_reward: --
  1171/100000: episode: 75

  1682/100000: episode: 106, duration: 0.113s, episode steps: 20, steps per second: 177, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.069 [-1.567, 2.604], mean_best_reward: --
  1696/100000: episode: 107, duration: 0.104s, episode steps: 14, steps per second: 134, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.714 [0.000, 1.000], mean observation: -0.095 [-2.152, 1.353], mean_best_reward: --
  1711/100000: episode: 108, duration: 0.093s, episode steps: 15, steps per second: 162, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.085 [-1.011, 1.756], mean_best_reward: --
  1737/100000: episode: 109, duration: 0.158s, episode steps: 26, steps per second: 165, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: 0.117 [-0.353, 0.848], mean_best_reward: --
  1753/100000: epis

  2210/100000: episode: 141, duration: 0.107s, episode steps: 23, steps per second: 214, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.565 [0.000, 1.000], mean observation: -0.112 [-1.610, 0.623], mean_best_reward: --
  2218/100000: episode: 142, duration: 0.048s, episode steps: 8, steps per second: 165, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.123 [-2.510, 1.603], mean_best_reward: --
  2228/100000: episode: 143, duration: 0.063s, episode steps: 10, steps per second: 158, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.142 [-2.435, 1.517], mean_best_reward: --
  2259/100000: episode: 144, duration: 0.149s, episode steps: 31, steps per second: 208, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.111 [-0.412, 1.010], mean_best_reward: --
  2269/100000: epis

  2837/100000: episode: 176, duration: 0.116s, episode steps: 17, steps per second: 147, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: -0.096 [-1.473, 0.757], mean_best_reward: --
  2851/100000: episode: 177, duration: 0.070s, episode steps: 14, steps per second: 201, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.214 [0.000, 1.000], mean observation: 0.092 [-1.581, 2.529], mean_best_reward: --
  2865/100000: episode: 178, duration: 0.069s, episode steps: 14, steps per second: 203, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.714 [0.000, 1.000], mean observation: -0.087 [-2.080, 1.226], mean_best_reward: --
  2887/100000: episode: 179, duration: 0.106s, episode steps: 22, steps per second: 208, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: -0.074 [-1.637, 0.976], mean_best_reward: --
  2895/100000: ep

  3471/100000: episode: 212, duration: 0.051s, episode steps: 10, steps per second: 197, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.124 [-2.993, 1.997], mean_best_reward: --
  3486/100000: episode: 213, duration: 0.071s, episode steps: 15, steps per second: 211, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.057 [-1.918, 1.220], mean_best_reward: --
  3508/100000: episode: 214, duration: 0.102s, episode steps: 22, steps per second: 216, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: 0.101 [-0.617, 0.874], mean_best_reward: --
  3517/100000: episode: 215, duration: 0.051s, episode steps: 9, steps per second: 177, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.111 [0.000, 1.000], mean observation: 0.131 [-1.601, 2.458], mean_best_reward: --
  3537/100000: episo

  4155/100000: episode: 246, duration: 0.055s, episode steps: 11, steps per second: 198, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.117 [-1.388, 2.264], mean_best_reward: --
  4166/100000: episode: 247, duration: 0.054s, episode steps: 11, steps per second: 205, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.113 [-1.379, 2.226], mean_best_reward: --
  4177/100000: episode: 248, duration: 0.053s, episode steps: 11, steps per second: 207, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.091 [0.000, 1.000], mean observation: 0.111 [-1.784, 2.697], mean_best_reward: --
  4243/100000: episode: 249, duration: 0.302s, episode steps: 66, steps per second: 219, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.118 [-1.212, 0.696], mean_best_reward: --
  4259/100000: epis

  4755/100000: episode: 280, duration: 0.062s, episode steps: 12, steps per second: 193, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.167 [0.000, 1.000], mean observation: 0.115 [-1.552, 2.567], mean_best_reward: --
  4815/100000: episode: 281, duration: 0.276s, episode steps: 60, steps per second: 217, episode reward: 60.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.068 [-2.742, 2.299], mean_best_reward: --
  4832/100000: episode: 282, duration: 0.078s, episode steps: 17, steps per second: 218, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.294 [0.000, 1.000], mean observation: 0.072 [-1.407, 2.242], mean_best_reward: --
  4847/100000: episode: 283, duration: 0.071s, episode steps: 15, steps per second: 212, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: 0.087 [-1.010, 1.513], mean_best_reward: --
  4869/100000: episo

  5469/100000: episode: 316, duration: 0.051s, episode steps: 10, steps per second: 198, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.146 [-1.557, 2.601], mean_best_reward: --
  5478/100000: episode: 317, duration: 0.046s, episode steps: 9, steps per second: 197, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.111 [0.000, 1.000], mean observation: 0.162 [-1.355, 2.315], mean_best_reward: --
  5497/100000: episode: 318, duration: 0.089s, episode steps: 19, steps per second: 213, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.421 [0.000, 1.000], mean observation: 0.052 [-0.995, 1.398], mean_best_reward: --
  5513/100000: episode: 319, duration: 0.078s, episode steps: 16, steps per second: 206, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.093 [-1.556, 2.611], mean_best_reward: --
  5534/100000: episode

  6076/100000: episode: 352, duration: 0.287s, episode steps: 64, steps per second: 223, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.547 [0.000, 1.000], mean observation: 0.187 [-0.752, 1.329], mean_best_reward: --
  6085/100000: episode: 353, duration: 0.047s, episode steps: 9, steps per second: 192, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.135 [-1.811, 2.850], mean_best_reward: --
  6104/100000: episode: 354, duration: 0.090s, episode steps: 19, steps per second: 211, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: 0.106 [-0.561, 1.293], mean_best_reward: --
  6115/100000: episode: 355, duration: 0.057s, episode steps: 11, steps per second: 194, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.818 [0.000, 1.000], mean observation: -0.105 [-2.355, 1.422], mean_best_reward: --
  6126/100000: episod

  6638/100000: episode: 386, duration: 0.142s, episode steps: 30, steps per second: 211, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.433 [0.000, 1.000], mean observation: 0.032 [-0.985, 1.493], mean_best_reward: --
  6660/100000: episode: 387, duration: 0.103s, episode steps: 22, steps per second: 215, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: -0.071 [-1.213, 0.741], mean_best_reward: --
  6674/100000: episode: 388, duration: 0.068s, episode steps: 14, steps per second: 205, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.119 [-0.557, 1.300], mean_best_reward: --
  6706/100000: episode: 389, duration: 0.155s, episode steps: 32, steps per second: 207, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.061 [-1.320, 0.610], mean_best_reward: --
  6802/100000: epi

  7525/100000: episode: 420, duration: 0.078s, episode steps: 15, steps per second: 193, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.733 [0.000, 1.000], mean observation: -0.098 [-2.372, 1.374], mean_best_reward: --
  7548/100000: episode: 421, duration: 0.106s, episode steps: 23, steps per second: 217, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.652 [0.000, 1.000], mean observation: -0.045 [-2.205, 1.384], mean_best_reward: --
  7618/100000: episode: 422, duration: 0.355s, episode steps: 70, steps per second: 197, episode reward: 70.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.099 [-0.793, 1.128], mean_best_reward: --
  7645/100000: episode: 423, duration: 0.142s, episode steps: 27, steps per second: 190, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.063 [-0.993, 0.600], mean_best_reward: --
  7774/100000: ep

  8654/100000: episode: 456, duration: 0.145s, episode steps: 30, steps per second: 207, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: 0.128 [-0.390, 1.471], mean_best_reward: --
  8664/100000: episode: 457, duration: 0.050s, episode steps: 10, steps per second: 201, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.117 [-1.561, 2.520], mean_best_reward: --
  8677/100000: episode: 458, duration: 0.067s, episode steps: 13, steps per second: 194, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.140 [-2.037, 1.151], mean_best_reward: --
  8695/100000: episode: 459, duration: 0.101s, episode steps: 18, steps per second: 179, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.058 [-2.000, 1.198], mean_best_reward: --
  8704/100000: epi

  9379/100000: episode: 490, duration: 0.111s, episode steps: 20, steps per second: 179, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.068 [-1.006, 1.803], mean_best_reward: --
  9391/100000: episode: 491, duration: 0.059s, episode steps: 12, steps per second: 204, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.134 [-1.206, 2.092], mean_best_reward: --
  9402/100000: episode: 492, duration: 0.078s, episode steps: 11, steps per second: 142, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.106 [-1.414, 2.301], mean_best_reward: --
  9412/100000: episode: 493, duration: 0.052s, episode steps: 10, steps per second: 192, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.111 [-1.584, 2.373], mean_best_reward: --
  9455/100000: episo

 10508/100000: episode: 524, duration: 0.233s, episode steps: 49, steps per second: 210, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.136 [-0.957, 0.440], mean_best_reward: --
 10531/100000: episode: 525, duration: 0.116s, episode steps: 23, steps per second: 198, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: -0.092 [-1.142, 0.612], mean_best_reward: --
 10555/100000: episode: 526, duration: 0.125s, episode steps: 24, steps per second: 193, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.583 [0.000, 1.000], mean observation: -0.057 [-1.589, 0.809], mean_best_reward: --
 10599/100000: episode: 527, duration: 0.223s, episode steps: 44, steps per second: 197, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.523 [0.000, 1.000], mean observation: 0.086 [-0.536, 0.940], mean_best_reward: --
 10632/100000: ep

 11616/100000: episode: 558, duration: 0.259s, episode steps: 57, steps per second: 220, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.062 [-0.499, 0.878], mean_best_reward: --
 11664/100000: episode: 559, duration: 0.217s, episode steps: 48, steps per second: 221, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.058 [-0.391, 1.122], mean_best_reward: --
 11724/100000: episode: 560, duration: 0.288s, episode steps: 60, steps per second: 209, episode reward: 60.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: 0.126 [-0.780, 1.121], mean_best_reward: --
 11735/100000: episode: 561, duration: 0.056s, episode steps: 11, steps per second: 198, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.909 [0.000, 1.000], mean observation: -0.127 [-2.857, 1.783], mean_best_reward: --
 11745/100000: epis

 12593/100000: episode: 594, duration: 0.062s, episode steps: 12, steps per second: 193, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.833 [0.000, 1.000], mean observation: -0.101 [-2.477, 1.586], mean_best_reward: --
 12605/100000: episode: 595, duration: 0.063s, episode steps: 12, steps per second: 191, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.123 [-1.895, 1.137], mean_best_reward: --
 12643/100000: episode: 596, duration: 0.175s, episode steps: 38, steps per second: 217, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.091 [-1.432, 0.790], mean_best_reward: --
 12685/100000: episode: 597, duration: 0.188s, episode steps: 42, steps per second: 223, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.038 [-0.451, 0.955], mean_best_reward: --
 12724/100000: ep

 13434/100000: episode: 630, duration: 0.109s, episode steps: 18, steps per second: 165, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.611 [0.000, 1.000], mean observation: -0.087 [-1.681, 0.954], mean_best_reward: --
 13463/100000: episode: 631, duration: 0.133s, episode steps: 29, steps per second: 218, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: -0.055 [-1.057, 0.564], mean_best_reward: --
 13474/100000: episode: 632, duration: 0.056s, episode steps: 11, steps per second: 197, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.121 [-1.388, 2.259], mean_best_reward: --
 13486/100000: episode: 633, duration: 0.067s, episode steps: 12, steps per second: 180, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.167 [0.000, 1.000], mean observation: 0.111 [-1.549, 2.500], mean_best_reward: --
 13497/100000: epi

 14220/100000: episode: 664, duration: 0.107s, episode steps: 21, steps per second: 197, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.571 [0.000, 1.000], mean observation: -0.088 [-1.374, 0.613], mean_best_reward: --
 14255/100000: episode: 665, duration: 0.171s, episode steps: 35, steps per second: 205, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.079 [-1.118, 0.576], mean_best_reward: --
 14298/100000: episode: 666, duration: 0.208s, episode steps: 43, steps per second: 206, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: -0.115 [-1.186, 0.476], mean_best_reward: --
 14329/100000: episode: 667, duration: 0.144s, episode steps: 31, steps per second: 216, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.419 [0.000, 1.000], mean observation: 0.033 [-1.223, 1.864], mean_best_reward: --
 14346/100000: ep

 15188/100000: episode: 698, duration: 0.197s, episode steps: 42, steps per second: 213, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.074 [-1.121, 0.810], mean_best_reward: --
 15242/100000: episode: 699, duration: 0.244s, episode steps: 54, steps per second: 221, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.090 [-0.418, 0.729], mean_best_reward: --
 15315/100000: episode: 700, duration: 0.326s, episode steps: 73, steps per second: 224, episode reward: 73.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: 0.102 [-0.486, 1.697], mean_best_reward: --
 15330/100000: episode: 701, duration: 0.071s, episode steps: 15, steps per second: 210, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.733 [0.000, 1.000], mean observation: -0.093 [-2.211, 1.362], mean_best_reward: 113.000000
 15354/100

 16635/100000: episode: 733, duration: 0.057s, episode steps: 11, steps per second: 193, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.636 [0.000, 1.000], mean observation: -0.124 [-1.714, 0.967], mean_best_reward: --
 16680/100000: episode: 734, duration: 0.204s, episode steps: 45, steps per second: 220, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: 0.053 [-1.140, 1.532], mean_best_reward: --
 16719/100000: episode: 735, duration: 0.195s, episode steps: 39, steps per second: 200, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: 0.083 [-0.765, 1.205], mean_best_reward: --
 16767/100000: episode: 736, duration: 0.233s, episode steps: 48, steps per second: 206, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.045 [-0.630, 0.931], mean_best_reward: --
 16807/100000: epis

 17964/100000: episode: 767, duration: 0.279s, episode steps: 55, steps per second: 197, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: 0.158 [-0.502, 0.886], mean_best_reward: --
 17984/100000: episode: 768, duration: 0.103s, episode steps: 20, steps per second: 195, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: -0.081 [-1.540, 0.822], mean_best_reward: --
 18025/100000: episode: 769, duration: 0.205s, episode steps: 41, steps per second: 200, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.537 [0.000, 1.000], mean observation: -0.067 [-1.708, 0.649], mean_best_reward: --
 18063/100000: episode: 770, duration: 0.193s, episode steps: 38, steps per second: 197, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.077 [-0.877, 0.579], mean_best_reward: --
 18084/100000: ep

 19176/100000: episode: 801, duration: 0.114s, episode steps: 21, steps per second: 184, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: 0.127 [-0.539, 0.961], mean_best_reward: 68.500000
 19186/100000: episode: 802, duration: 0.050s, episode steps: 10, steps per second: 200, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: -0.142 [-1.993, 1.191], mean_best_reward: --
 19220/100000: episode: 803, duration: 0.154s, episode steps: 34, steps per second: 221, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.077 [-0.542, 0.992], mean_best_reward: --
 19247/100000: episode: 804, duration: 0.158s, episode steps: 27, steps per second: 171, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.073 [-1.120, 0.585], mean_best_reward: --
 19281/1000

 20331/100000: episode: 835, duration: 0.080s, episode steps: 13, steps per second: 163, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.119 [-2.114, 1.337], mean_best_reward: --
 20348/100000: episode: 836, duration: 0.084s, episode steps: 17, steps per second: 203, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.765 [0.000, 1.000], mean observation: -0.073 [-2.740, 1.720], mean_best_reward: --
 20396/100000: episode: 837, duration: 0.232s, episode steps: 48, steps per second: 207, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.067 [-1.225, 0.547], mean_best_reward: --
 20406/100000: episode: 838, duration: 0.050s, episode steps: 10, steps per second: 199, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.130 [-0.995, 1.624], mean_best_reward: --
 20427/100000: ep

 21224/100000: episode: 870, duration: 0.165s, episode steps: 31, steps per second: 188, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.452 [0.000, 1.000], mean observation: -0.010 [-1.177, 1.470], mean_best_reward: --
 21235/100000: episode: 871, duration: 0.059s, episode steps: 11, steps per second: 187, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.091 [0.000, 1.000], mean observation: 0.112 [-1.754, 2.738], mean_best_reward: --
 21246/100000: episode: 872, duration: 0.060s, episode steps: 11, steps per second: 183, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.107 [-1.353, 2.194], mean_best_reward: --
 21256/100000: episode: 873, duration: 0.050s, episode steps: 10, steps per second: 200, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.149 [-3.027, 1.934], mean_best_reward: --
 21300/100000: epi

 21873/100000: episode: 905, duration: 0.078s, episode steps: 14, steps per second: 179, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.286 [0.000, 1.000], mean observation: 0.117 [-1.145, 2.030], mean_best_reward: --
 21888/100000: episode: 906, duration: 0.087s, episode steps: 15, steps per second: 172, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: -0.106 [-1.166, 0.614], mean_best_reward: --
 21926/100000: episode: 907, duration: 0.184s, episode steps: 38, steps per second: 206, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.553 [0.000, 1.000], mean observation: -0.095 [-2.091, 1.008], mean_best_reward: --
 21939/100000: episode: 908, duration: 0.065s, episode steps: 13, steps per second: 201, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.769 [0.000, 1.000], mean observation: -0.104 [-2.213, 1.360], mean_best_reward: --
 21950/100000: ep

 22944/100000: episode: 940, duration: 0.242s, episode steps: 54, steps per second: 223, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.103 [-0.534, 1.081], mean_best_reward: --
 22983/100000: episode: 941, duration: 0.177s, episode steps: 39, steps per second: 220, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.436 [0.000, 1.000], mean observation: -0.030 [-0.935, 1.393], mean_best_reward: --
 22999/100000: episode: 942, duration: 0.079s, episode steps: 16, steps per second: 203, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.562 [0.000, 1.000], mean observation: -0.096 [-1.396, 0.757], mean_best_reward: --
 23012/100000: episode: 943, duration: 0.063s, episode steps: 13, steps per second: 205, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.113 [-1.796, 1.011], mean_best_reward: --
 23026/100000: ep

 23990/100000: episode: 976, duration: 0.103s, episode steps: 21, steps per second: 204, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: 0.080 [-0.582, 1.041], mean_best_reward: --
 24054/100000: episode: 977, duration: 0.302s, episode steps: 64, steps per second: 212, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.136 [-0.906, 0.854], mean_best_reward: --
 24108/100000: episode: 978, duration: 0.280s, episode steps: 54, steps per second: 193, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.093 [-0.912, 0.813], mean_best_reward: --
 24164/100000: episode: 979, duration: 0.257s, episode steps: 56, steps per second: 218, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.027 [-1.001, 0.733], mean_best_reward: --
 24201/100000: epi

 25180/100000: episode: 1010, duration: 0.094s, episode steps: 19, steps per second: 202, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.090 [-0.635, 1.098], mean_best_reward: --
 25285/100000: episode: 1011, duration: 0.537s, episode steps: 105, steps per second: 195, episode reward: 105.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.218 [-2.104, 1.823], mean_best_reward: --
 25299/100000: episode: 1012, duration: 0.069s, episode steps: 14, steps per second: 204, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.123 [-0.561, 1.104], mean_best_reward: --
 25312/100000: episode: 1013, duration: 0.065s, episode steps: 13, steps per second: 200, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: 0.103 [-0.579, 1.199], mean_best_reward: --
 25399/100000

 26882/100000: episode: 1044, duration: 0.153s, episode steps: 31, steps per second: 203, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: -0.093 [-1.596, 0.779], mean_best_reward: --
 26922/100000: episode: 1045, duration: 0.194s, episode steps: 40, steps per second: 207, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.045 [-0.528, 1.107], mean_best_reward: --
 26951/100000: episode: 1046, duration: 0.137s, episode steps: 29, steps per second: 212, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.552 [0.000, 1.000], mean observation: -0.046 [-1.306, 0.806], mean_best_reward: --
 27005/100000: episode: 1047, duration: 0.245s, episode steps: 54, steps per second: 220, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: 0.156 [-0.432, 1.052], mean_best_reward: --
 27019/100000:

 28408/100000: episode: 1078, duration: 0.609s, episode steps: 129, steps per second: 212, episode reward: 129.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.504 [0.000, 1.000], mean observation: 0.027 [-1.541, 1.064], mean_best_reward: --
 28418/100000: episode: 1079, duration: 0.052s, episode steps: 10, steps per second: 194, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: -0.114 [-1.688, 0.972], mean_best_reward: --
 28509/100000: episode: 1080, duration: 0.409s, episode steps: 91, steps per second: 223, episode reward: 91.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.085 [-0.738, 0.907], mean_best_reward: --
 28565/100000: episode: 1081, duration: 0.256s, episode steps: 56, steps per second: 219, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.114 [-0.522, 1.375], mean_best_reward: --
 28626/100000

 29962/100000: episode: 1113, duration: 0.103s, episode steps: 22, steps per second: 213, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.052 [-1.518, 0.976], mean_best_reward: --
 30004/100000: episode: 1114, duration: 0.189s, episode steps: 42, steps per second: 222, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.452 [0.000, 1.000], mean observation: -0.155 [-0.956, 0.411], mean_best_reward: --
 30071/100000: episode: 1115, duration: 0.304s, episode steps: 67, steps per second: 220, episode reward: 67.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.100 [-0.960, 0.351], mean_best_reward: --
 30118/100000: episode: 1116, duration: 0.215s, episode steps: 47, steps per second: 218, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: 0.060 [-0.591, 1.520], mean_best_reward: --
 30166/100000

 31429/100000: episode: 1148, duration: 0.231s, episode steps: 46, steps per second: 199, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.030 [-0.998, 0.637], mean_best_reward: --
 31479/100000: episode: 1149, duration: 0.249s, episode steps: 50, steps per second: 201, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.109 [-0.538, 1.386], mean_best_reward: --
 31501/100000: episode: 1150, duration: 0.101s, episode steps: 22, steps per second: 217, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.121 [-1.335, 0.572], mean_best_reward: --
 31535/100000: episode: 1151, duration: 0.160s, episode steps: 34, steps per second: 212, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.142 [-0.713, 0.252], mean_best_reward: 173.500000
 3161

 32724/100000: episode: 1184, duration: 0.249s, episode steps: 48, steps per second: 193, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.039 [-0.826, 1.131], mean_best_reward: --
 32762/100000: episode: 1185, duration: 0.175s, episode steps: 38, steps per second: 217, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: 0.042 [-0.602, 1.300], mean_best_reward: --
 32815/100000: episode: 1186, duration: 0.255s, episode steps: 53, steps per second: 208, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.175 [-0.416, 1.099], mean_best_reward: --
 32860/100000: episode: 1187, duration: 0.205s, episode steps: 45, steps per second: 220, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: 0.090 [-0.491, 1.191], mean_best_reward: --
 32905/100000: e

 34284/100000: episode: 1218, duration: 0.310s, episode steps: 67, steps per second: 216, episode reward: 67.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.123 [-0.677, 0.753], mean_best_reward: --
 34309/100000: episode: 1219, duration: 0.122s, episode steps: 25, steps per second: 204, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.560 [0.000, 1.000], mean observation: -0.033 [-1.293, 0.999], mean_best_reward: --
 34321/100000: episode: 1220, duration: 0.060s, episode steps: 12, steps per second: 200, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.107 [-0.771, 1.426], mean_best_reward: --
 34349/100000: episode: 1221, duration: 0.134s, episode steps: 28, steps per second: 210, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.125 [-0.605, 1.070], mean_best_reward: --
 34391/100000: 

 35782/100000: episode: 1252, duration: 0.264s, episode steps: 39, steps per second: 147, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: 0.110 [-0.409, 0.980], mean_best_reward: --
 35824/100000: episode: 1253, duration: 0.258s, episode steps: 42, steps per second: 163, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.115 [-0.372, 1.102], mean_best_reward: --
 35850/100000: episode: 1254, duration: 0.157s, episode steps: 26, steps per second: 166, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.033 [-0.636, 0.965], mean_best_reward: --
 35899/100000: episode: 1255, duration: 0.314s, episode steps: 49, steps per second: 156, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: 0.112 [-0.427, 0.889], mean_best_reward: --
 35953/100000: e

 37304/100000: episode: 1286, duration: 0.152s, episode steps: 18, steps per second: 119, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: -0.120 [-1.254, 0.418], mean_best_reward: --
 37330/100000: episode: 1287, duration: 0.136s, episode steps: 26, steps per second: 192, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.615 [0.000, 1.000], mean observation: -0.108 [-2.266, 1.151], mean_best_reward: --
 37370/100000: episode: 1288, duration: 0.200s, episode steps: 40, steps per second: 200, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: -0.141 [-0.815, 0.449], mean_best_reward: --
 37396/100000: episode: 1289, duration: 0.129s, episode steps: 26, steps per second: 202, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.074 [-0.602, 1.307], mean_best_reward: --
 37416/100000

 38730/100000: episode: 1321, duration: 0.178s, episode steps: 34, steps per second: 191, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.122 [-0.736, 1.244], mean_best_reward: --
 38769/100000: episode: 1322, duration: 0.210s, episode steps: 39, steps per second: 186, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: 0.093 [-0.381, 1.260], mean_best_reward: --
 38835/100000: episode: 1323, duration: 0.328s, episode steps: 66, steps per second: 201, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.081 [-0.625, 1.169], mean_best_reward: --
 38849/100000: episode: 1324, duration: 0.074s, episode steps: 14, steps per second: 190, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.643 [0.000, 1.000], mean observation: -0.112 [-1.609, 0.766], mean_best_reward: --
 38870/100000: 

 40048/100000: episode: 1355, duration: 0.254s, episode steps: 53, steps per second: 209, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.131 [-1.103, 0.560], mean_best_reward: --
 40064/100000: episode: 1356, duration: 0.081s, episode steps: 16, steps per second: 196, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: 0.100 [-0.930, 1.456], mean_best_reward: --
 40096/100000: episode: 1357, duration: 0.156s, episode steps: 32, steps per second: 205, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.093 [-1.228, 0.541], mean_best_reward: --
 40111/100000: episode: 1358, duration: 0.084s, episode steps: 15, steps per second: 178, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: -0.057 [-2.690, 1.804], mean_best_reward: --
 40121/100000

 40717/100000: episode: 1389, duration: 0.145s, episode steps: 29, steps per second: 200, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: -0.099 [-1.238, 0.396], mean_best_reward: --
 40732/100000: episode: 1390, duration: 0.086s, episode steps: 15, steps per second: 174, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.111 [-0.817, 1.532], mean_best_reward: --
 40751/100000: episode: 1391, duration: 0.107s, episode steps: 19, steps per second: 178, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.368 [0.000, 1.000], mean observation: 0.048 [-1.209, 1.735], mean_best_reward: --
 40767/100000: episode: 1392, duration: 0.084s, episode steps: 16, steps per second: 191, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.117 [-0.784, 1.618], mean_best_reward: --
 40786/100000: 

 42164/100000: episode: 1423, duration: 0.265s, episode steps: 54, steps per second: 203, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.129 [-0.955, 0.430], mean_best_reward: --
 42203/100000: episode: 1424, duration: 0.189s, episode steps: 39, steps per second: 207, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: -0.103 [-1.168, 0.594], mean_best_reward: --
 42235/100000: episode: 1425, duration: 0.154s, episode steps: 32, steps per second: 208, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.100 [-1.152, 0.388], mean_best_reward: --
 42288/100000: episode: 1426, duration: 0.267s, episode steps: 53, steps per second: 198, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.057 [-1.329, 0.828], mean_best_reward: --
 42399/10000

 43856/100000: episode: 1457, duration: 0.267s, episode steps: 55, steps per second: 206, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.111 [-1.223, 0.783], mean_best_reward: --
 43870/100000: episode: 1458, duration: 0.074s, episode steps: 14, steps per second: 190, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.643 [0.000, 1.000], mean observation: -0.064 [-1.760, 1.194], mean_best_reward: --
 43902/100000: episode: 1459, duration: 0.170s, episode steps: 32, steps per second: 188, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.067 [-1.210, 0.630], mean_best_reward: --
 43968/100000: episode: 1460, duration: 0.313s, episode steps: 66, steps per second: 211, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.127 [-0.890, 1.173], mean_best_reward: --
 44027/100000

 45218/100000: episode: 1491, duration: 0.293s, episode steps: 63, steps per second: 215, episode reward: 63.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: 0.172 [-0.430, 0.991], mean_best_reward: --
 45290/100000: episode: 1492, duration: 0.339s, episode steps: 72, steps per second: 213, episode reward: 72.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.076 [-0.810, 1.123], mean_best_reward: --
 45302/100000: episode: 1493, duration: 0.061s, episode steps: 12, steps per second: 198, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.833 [0.000, 1.000], mean observation: -0.125 [-2.593, 1.615], mean_best_reward: --
 45333/100000: episode: 1494, duration: 0.150s, episode steps: 31, steps per second: 207, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.548 [0.000, 1.000], mean observation: -0.031 [-1.508, 0.831], mean_best_reward: --
 45372/100000:

 46519/100000: episode: 1525, duration: 0.206s, episode steps: 44, steps per second: 214, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.123 [-1.328, 0.345], mean_best_reward: --
 46544/100000: episode: 1526, duration: 0.126s, episode steps: 25, steps per second: 199, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: -0.086 [-1.204, 0.619], mean_best_reward: --
 46565/100000: episode: 1527, duration: 0.103s, episode steps: 21, steps per second: 203, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: 0.098 [-0.561, 1.290], mean_best_reward: --
 46619/100000: episode: 1528, duration: 0.252s, episode steps: 54, steps per second: 215, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.088 [-1.303, 0.570], mean_best_reward: --
 46636/100000

 47918/100000: episode: 1560, duration: 0.181s, episode steps: 38, steps per second: 209, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.072 [-0.561, 1.011], mean_best_reward: --
 47949/100000: episode: 1561, duration: 0.146s, episode steps: 31, steps per second: 212, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.548 [0.000, 1.000], mean observation: 0.142 [-0.748, 1.240], mean_best_reward: --
 47982/100000: episode: 1562, duration: 0.157s, episode steps: 33, steps per second: 210, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.071 [-0.570, 1.339], mean_best_reward: --
 48011/100000: episode: 1563, duration: 0.146s, episode steps: 29, steps per second: 199, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.448 [0.000, 1.000], mean observation: -0.123 [-1.527, 0.935], mean_best_reward: --
 48044/100000: 

 49335/100000: episode: 1595, duration: 0.334s, episode steps: 71, steps per second: 213, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: -0.004 [-1.457, 1.313], mean_best_reward: --
 49365/100000: episode: 1596, duration: 0.146s, episode steps: 30, steps per second: 205, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: 0.116 [-0.439, 1.428], mean_best_reward: --
 49465/100000: episode: 1597, duration: 0.474s, episode steps: 100, steps per second: 211, episode reward: 100.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.104 [-1.542, 0.611], mean_best_reward: --
 49524/100000: episode: 1598, duration: 0.280s, episode steps: 59, steps per second: 211, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: 0.051 [-0.593, 1.185], mean_best_reward: --
 49595/10000

 50834/100000: episode: 1629, duration: 0.139s, episode steps: 28, steps per second: 201, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.064 [-0.949, 0.634], mean_best_reward: --
 50855/100000: episode: 1630, duration: 0.140s, episode steps: 21, steps per second: 150, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: 0.107 [-0.644, 1.429], mean_best_reward: --
 50884/100000: episode: 1631, duration: 0.147s, episode steps: 29, steps per second: 198, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.621 [0.000, 1.000], mean observation: -0.035 [-2.137, 1.335], mean_best_reward: --
 50940/100000: episode: 1632, duration: 0.277s, episode steps: 56, steps per second: 202, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.589 [0.000, 1.000], mean observation: 0.129 [-2.058, 1.942], mean_best_reward: --
 50994/100000:

 52503/100000: episode: 1663, duration: 0.465s, episode steps: 101, steps per second: 217, episode reward: 101.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.308 [-0.835, 1.596], mean_best_reward: --
 52600/100000: episode: 1664, duration: 0.455s, episode steps: 97, steps per second: 213, episode reward: 97.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.030 [-1.263, 0.746], mean_best_reward: --
 52630/100000: episode: 1665, duration: 0.161s, episode steps: 30, steps per second: 186, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.107 [-0.920, 0.413], mean_best_reward: --
 52652/100000: episode: 1666, duration: 0.111s, episode steps: 22, steps per second: 198, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.054 [-1.355, 0.832], mean_best_reward: --
 52676/1000

 53830/100000: episode: 1698, duration: 0.234s, episode steps: 47, steps per second: 201, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.087 [-1.254, 0.433], mean_best_reward: --
 53862/100000: episode: 1699, duration: 0.151s, episode steps: 32, steps per second: 211, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.041 [-1.140, 0.626], mean_best_reward: --
 53884/100000: episode: 1700, duration: 0.122s, episode steps: 22, steps per second: 180, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: -0.057 [-1.263, 0.808], mean_best_reward: --
 53914/100000: episode: 1701, duration: 0.152s, episode steps: 30, steps per second: 197, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.116 [-1.064, 0.355], mean_best_reward: 93.500000
 5398

 55514/100000: episode: 1732, duration: 0.258s, episode steps: 53, steps per second: 206, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.036 [-1.114, 0.794], mean_best_reward: --
 55536/100000: episode: 1733, duration: 0.114s, episode steps: 22, steps per second: 193, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.591 [0.000, 1.000], mean observation: -0.089 [-1.670, 0.758], mean_best_reward: --
 55570/100000: episode: 1734, duration: 0.168s, episode steps: 34, steps per second: 203, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.090 [-1.150, 0.578], mean_best_reward: --
 55633/100000: episode: 1735, duration: 0.311s, episode steps: 63, steps per second: 203, episode reward: 63.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.460 [0.000, 1.000], mean observation: -0.215 [-1.689, 0.369], mean_best_reward: --
 55673/10000

 56953/100000: episode: 1766, duration: 0.128s, episode steps: 27, steps per second: 212, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.096 [-0.614, 0.902], mean_best_reward: --
 57005/100000: episode: 1767, duration: 0.242s, episode steps: 52, steps per second: 214, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.114 [-1.003, 0.678], mean_best_reward: --
 57020/100000: episode: 1768, duration: 0.075s, episode steps: 15, steps per second: 199, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.089 [-2.166, 1.338], mean_best_reward: --
 57063/100000: episode: 1769, duration: 0.200s, episode steps: 43, steps per second: 215, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.076 [-1.069, 0.994], mean_best_reward: --
 57089/100000:

 58525/100000: episode: 1800, duration: 0.144s, episode steps: 29, steps per second: 201, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.552 [0.000, 1.000], mean observation: -0.063 [-1.890, 1.028], mean_best_reward: --
 58575/100000: episode: 1801, duration: 0.306s, episode steps: 50, steps per second: 163, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.123 [-0.250, 0.975], mean_best_reward: 101.000000
 58630/100000: episode: 1802, duration: 0.274s, episode steps: 55, steps per second: 201, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.089 [-0.925, 0.682], mean_best_reward: --
 58664/100000: episode: 1803, duration: 0.164s, episode steps: 34, steps per second: 207, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.441 [0.000, 1.000], mean observation: 0.086 [-0.951, 1.940], mean_best_reward: --
 58699

 60266/100000: episode: 1835, duration: 0.142s, episode steps: 24, steps per second: 169, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.050 [-0.931, 1.405], mean_best_reward: --
 60304/100000: episode: 1836, duration: 0.191s, episode steps: 38, steps per second: 199, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.107 [-0.732, 1.170], mean_best_reward: --
 60336/100000: episode: 1837, duration: 0.183s, episode steps: 32, steps per second: 175, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.138 [-1.323, 0.686], mean_best_reward: --
 60368/100000: episode: 1838, duration: 0.158s, episode steps: 32, steps per second: 202, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: 0.099 [-0.587, 1.491], mean_best_reward: --
 60402/100000: 

 61629/100000: episode: 1869, duration: 0.229s, episode steps: 46, steps per second: 201, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.137 [-1.052, 1.175], mean_best_reward: --
 61646/100000: episode: 1870, duration: 0.088s, episode steps: 17, steps per second: 193, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: -0.076 [-1.491, 0.819], mean_best_reward: --
 61703/100000: episode: 1871, duration: 0.370s, episode steps: 57, steps per second: 154, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.166 [-1.083, 0.715], mean_best_reward: --
 61790/100000: episode: 1872, duration: 0.429s, episode steps: 87, steps per second: 203, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: -0.073 [-0.832, 0.916], mean_best_reward: --
 61883/100000

 63224/100000: episode: 1904, duration: 0.169s, episode steps: 32, steps per second: 189, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: 0.072 [-0.456, 1.104], mean_best_reward: --
 63274/100000: episode: 1905, duration: 0.246s, episode steps: 50, steps per second: 203, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.144 [-0.404, 0.949], mean_best_reward: --
 63315/100000: episode: 1906, duration: 0.201s, episode steps: 41, steps per second: 204, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.104 [-1.398, 0.571], mean_best_reward: --
 63357/100000: episode: 1907, duration: 0.212s, episode steps: 42, steps per second: 198, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: -0.051 [-1.545, 0.814], mean_best_reward: --
 63416/100000:

 64916/100000: episode: 1938, duration: 0.441s, episode steps: 91, steps per second: 206, episode reward: 91.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.082 [-0.896, 1.406], mean_best_reward: --
 64941/100000: episode: 1939, duration: 0.126s, episode steps: 25, steps per second: 198, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.104 [-0.582, 1.032], mean_best_reward: --
 64958/100000: episode: 1940, duration: 0.089s, episode steps: 17, steps per second: 192, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.588 [0.000, 1.000], mean observation: -0.103 [-1.391, 0.606], mean_best_reward: --
 64996/100000: episode: 1941, duration: 0.215s, episode steps: 38, steps per second: 176, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.100 [-0.970, 0.274], mean_best_reward: --
 65034/100000:

 66344/100000: episode: 1972, duration: 0.284s, episode steps: 58, steps per second: 204, episode reward: 58.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: -0.140 [-1.021, 0.683], mean_best_reward: --
 66424/100000: episode: 1973, duration: 0.377s, episode steps: 80, steps per second: 212, episode reward: 80.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.148 [-0.647, 1.434], mean_best_reward: --
 66461/100000: episode: 1974, duration: 0.178s, episode steps: 37, steps per second: 208, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.038 [-0.961, 0.584], mean_best_reward: --
 66548/100000: episode: 1975, duration: 0.403s, episode steps: 87, steps per second: 216, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.506 [0.000, 1.000], mean observation: 0.108 [-0.743, 1.104], mean_best_reward: --
 66612/100000:

 67973/100000: episode: 2006, duration: 0.150s, episode steps: 29, steps per second: 194, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: 0.091 [-0.617, 1.310], mean_best_reward: --
 68026/100000: episode: 2007, duration: 0.263s, episode steps: 53, steps per second: 202, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.115 [-0.745, 0.346], mean_best_reward: --
 68116/100000: episode: 2008, duration: 0.476s, episode steps: 90, steps per second: 189, episode reward: 90.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: -0.020 [-1.253, 0.973], mean_best_reward: --
 68188/100000: episode: 2009, duration: 0.356s, episode steps: 72, steps per second: 202, episode reward: 72.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.059 [-0.761, 1.220], mean_best_reward: --
 68334/100000:

 69824/100000: episode: 2041, duration: 0.306s, episode steps: 58, steps per second: 189, episode reward: 58.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: -0.072 [-0.864, 0.685], mean_best_reward: --
 69843/100000: episode: 2042, duration: 0.111s, episode steps: 19, steps per second: 172, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.083 [-0.943, 0.599], mean_best_reward: --
 69897/100000: episode: 2043, duration: 0.274s, episode steps: 54, steps per second: 197, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.014 [-0.960, 1.232], mean_best_reward: --
 69950/100000: episode: 2044, duration: 0.269s, episode steps: 53, steps per second: 197, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.042 [-0.792, 1.242], mean_best_reward: --
 69980/100000

 71608/100000: episode: 2075, duration: 0.304s, episode steps: 61, steps per second: 201, episode reward: 61.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.049 [-0.552, 0.844], mean_best_reward: --
 71660/100000: episode: 2076, duration: 0.250s, episode steps: 52, steps per second: 208, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.114 [-0.401, 1.245], mean_best_reward: --
 71722/100000: episode: 2077, duration: 0.309s, episode steps: 62, steps per second: 201, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.149 [-0.822, 1.141], mean_best_reward: --
 71749/100000: episode: 2078, duration: 0.134s, episode steps: 27, steps per second: 201, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.407 [0.000, 1.000], mean observation: 0.043 [-1.003, 1.791], mean_best_reward: --
 71812/100000: e

 72985/100000: episode: 2109, duration: 0.142s, episode steps: 29, steps per second: 204, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: -0.099 [-1.453, 0.574], mean_best_reward: --
 73000/100000: episode: 2110, duration: 0.075s, episode steps: 15, steps per second: 201, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: -0.079 [-1.338, 0.824], mean_best_reward: --
 73047/100000: episode: 2111, duration: 0.227s, episode steps: 47, steps per second: 207, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.532 [0.000, 1.000], mean observation: 0.099 [-0.831, 1.256], mean_best_reward: --
 73063/100000: episode: 2112, duration: 0.081s, episode steps: 16, steps per second: 199, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.688 [0.000, 1.000], mean observation: -0.123 [-2.154, 1.160], mean_best_reward: --
 73166/100000

 74345/100000: episode: 2143, duration: 0.323s, episode steps: 65, steps per second: 201, episode reward: 65.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.492 [0.000, 1.000], mean observation: -0.067 [-0.939, 0.527], mean_best_reward: --
 74370/100000: episode: 2144, duration: 0.123s, episode steps: 25, steps per second: 203, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.560 [0.000, 1.000], mean observation: 0.095 [-0.938, 1.419], mean_best_reward: --
 74400/100000: episode: 2145, duration: 0.154s, episode steps: 30, steps per second: 195, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.074 [-1.061, 0.552], mean_best_reward: --
 74423/100000: episode: 2146, duration: 0.120s, episode steps: 23, steps per second: 191, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.050 [-0.807, 1.279], mean_best_reward: --
 74440/100000:

 75902/100000: episode: 2177, duration: 0.363s, episode steps: 73, steps per second: 201, episode reward: 73.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.507 [0.000, 1.000], mean observation: 0.084 [-0.506, 1.123], mean_best_reward: --
 75944/100000: episode: 2178, duration: 0.218s, episode steps: 42, steps per second: 193, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.022 [-1.263, 0.741], mean_best_reward: --
 75988/100000: episode: 2179, duration: 0.246s, episode steps: 44, steps per second: 179, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.121 [-1.057, 0.346], mean_best_reward: --
 76028/100000: episode: 2180, duration: 0.203s, episode steps: 40, steps per second: 197, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.145 [-0.373, 0.790], mean_best_reward: --
 76055/100000:

 77310/100000: episode: 2211, duration: 0.262s, episode steps: 57, steps per second: 217, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.034 [-0.955, 0.433], mean_best_reward: --
 77350/100000: episode: 2212, duration: 0.193s, episode steps: 40, steps per second: 208, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.058 [-0.619, 0.962], mean_best_reward: --
 77388/100000: episode: 2213, duration: 0.174s, episode steps: 38, steps per second: 218, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.087 [-0.538, 1.166], mean_best_reward: --
 77465/100000: episode: 2214, duration: 0.366s, episode steps: 77, steps per second: 210, episode reward: 77.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.506 [0.000, 1.000], mean observation: 0.016 [-1.084, 0.745], mean_best_reward: --
 77564/100000: 

 78717/100000: episode: 2245, duration: 0.405s, episode steps: 88, steps per second: 217, episode reward: 88.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: 0.118 [-0.595, 0.970], mean_best_reward: --
 78763/100000: episode: 2246, duration: 0.246s, episode steps: 46, steps per second: 187, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.084 [-0.596, 1.133], mean_best_reward: --
 78813/100000: episode: 2247, duration: 0.238s, episode steps: 50, steps per second: 210, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.078 [-0.488, 0.910], mean_best_reward: --
 78831/100000: episode: 2248, duration: 0.087s, episode steps: 18, steps per second: 206, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.389 [0.000, 1.000], mean observation: 0.074 [-0.775, 1.505], mean_best_reward: --
 78845/100000: e

 80398/100000: episode: 2279, duration: 0.225s, episode steps: 44, steps per second: 195, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.098 [-0.560, 1.055], mean_best_reward: --
 80419/100000: episode: 2280, duration: 0.104s, episode steps: 21, steps per second: 201, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.571 [0.000, 1.000], mean observation: -0.039 [-1.508, 1.003], mean_best_reward: --
 80481/100000: episode: 2281, duration: 0.297s, episode steps: 62, steps per second: 209, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.108 [-1.436, 0.551], mean_best_reward: --
 80681/100000: episode: 2282, duration: 0.975s, episode steps: 200, steps per second: 205, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.010 [-0.757, 0.914], mean_best_reward: --
 80699/10000

 82339/100000: episode: 2314, duration: 0.199s, episode steps: 37, steps per second: 186, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: 0.033 [-0.630, 1.147], mean_best_reward: --
 82358/100000: episode: 2315, duration: 0.089s, episode steps: 19, steps per second: 214, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.684 [0.000, 1.000], mean observation: -0.031 [-2.231, 1.574], mean_best_reward: --
 82385/100000: episode: 2316, duration: 0.126s, episode steps: 27, steps per second: 214, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.073 [-0.638, 1.155], mean_best_reward: --
 82468/100000: episode: 2317, duration: 0.389s, episode steps: 83, steps per second: 213, episode reward: 83.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.166 [-1.366, 0.802], mean_best_reward: --
 82481/100000:

 83659/100000: episode: 2348, duration: 0.175s, episode steps: 28, steps per second: 160, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: 0.067 [-0.981, 1.780], mean_best_reward: --
 83680/100000: episode: 2349, duration: 0.127s, episode steps: 21, steps per second: 166, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: -0.092 [-1.236, 0.630], mean_best_reward: --
 83707/100000: episode: 2350, duration: 0.169s, episode steps: 27, steps per second: 160, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: 0.049 [-0.764, 1.181], mean_best_reward: --
 83766/100000: episode: 2351, duration: 0.337s, episode steps: 59, steps per second: 175, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.102 [-0.403, 0.745], mean_best_reward: 96.500000
 83797/1

 85107/100000: episode: 2382, duration: 0.123s, episode steps: 26, steps per second: 212, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.121 [-0.899, 0.375], mean_best_reward: --
 85136/100000: episode: 2383, duration: 0.145s, episode steps: 29, steps per second: 199, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.067 [-0.634, 1.059], mean_best_reward: --
 85192/100000: episode: 2384, duration: 0.252s, episode steps: 56, steps per second: 222, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.482 [0.000, 1.000], mean observation: -0.101 [-1.149, 0.923], mean_best_reward: --
 85211/100000: episode: 2385, duration: 0.096s, episode steps: 19, steps per second: 197, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.579 [0.000, 1.000], mean observation: -0.042 [-1.715, 1.191], mean_best_reward: --
 85297/100000

 86531/100000: episode: 2416, duration: 0.098s, episode steps: 20, steps per second: 205, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: -0.085 [-1.382, 0.800], mean_best_reward: --
 86566/100000: episode: 2417, duration: 0.161s, episode steps: 35, steps per second: 217, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.543 [0.000, 1.000], mean observation: 0.128 [-0.432, 0.904], mean_best_reward: --
 86614/100000: episode: 2418, duration: 0.222s, episode steps: 48, steps per second: 216, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.479 [0.000, 1.000], mean observation: -0.091 [-0.905, 0.518], mean_best_reward: --
 86649/100000: episode: 2419, duration: 0.160s, episode steps: 35, steps per second: 219, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.090 [-1.140, 0.394], mean_best_reward: --
 86693/100000

 87973/100000: episode: 2451, duration: 0.164s, episode steps: 35, steps per second: 214, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.543 [0.000, 1.000], mean observation: -0.007 [-1.566, 1.198], mean_best_reward: 99.000000
 88007/100000: episode: 2452, duration: 0.155s, episode steps: 34, steps per second: 220, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.135 [-0.385, 0.780], mean_best_reward: --
 88064/100000: episode: 2453, duration: 0.291s, episode steps: 57, steps per second: 196, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.138 [-0.421, 0.902], mean_best_reward: --
 88100/100000: episode: 2454, duration: 0.173s, episode steps: 36, steps per second: 208, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.084 [-0.419, 1.044], mean_best_reward: --
 88126/1

 89694/100000: episode: 2485, duration: 0.205s, episode steps: 46, steps per second: 224, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.090 [-0.898, 0.611], mean_best_reward: --
 89761/100000: episode: 2486, duration: 0.300s, episode steps: 67, steps per second: 223, episode reward: 67.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.537 [0.000, 1.000], mean observation: 0.080 [-0.594, 1.075], mean_best_reward: --
 89777/100000: episode: 2487, duration: 0.076s, episode steps: 16, steps per second: 211, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.625 [0.000, 1.000], mean observation: -0.107 [-1.982, 1.133], mean_best_reward: --
 89894/100000: episode: 2488, duration: 0.526s, episode steps: 117, steps per second: 223, episode reward: 117.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: -0.020 [-1.395, 0.862], mean_best_reward: --
 89918/1000

 91241/100000: episode: 2520, duration: 0.128s, episode steps: 27, steps per second: 211, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.069 [-0.414, 0.948], mean_best_reward: --
 91284/100000: episode: 2521, duration: 0.193s, episode steps: 43, steps per second: 223, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: 0.052 [-0.986, 1.756], mean_best_reward: --
 91323/100000: episode: 2522, duration: 0.177s, episode steps: 39, steps per second: 220, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: 0.114 [-0.349, 1.058], mean_best_reward: --
 91385/100000: episode: 2523, duration: 0.282s, episode steps: 62, steps per second: 220, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.126 [-1.104, 0.585], mean_best_reward: --
 91399/100000: 

 92867/100000: episode: 2554, duration: 0.121s, episode steps: 25, steps per second: 206, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.066 [-1.030, 1.903], mean_best_reward: --
 92915/100000: episode: 2555, duration: 0.282s, episode steps: 48, steps per second: 170, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.051 [-0.416, 1.221], mean_best_reward: --
 92964/100000: episode: 2556, duration: 0.298s, episode steps: 49, steps per second: 165, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.408 [0.000, 1.000], mean observation: 0.026 [-1.905, 2.724], mean_best_reward: --
 93000/100000: episode: 2557, duration: 0.166s, episode steps: 36, steps per second: 217, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.097 [-0.318, 1.152], mean_best_reward: --
 93050/100000: e

 94494/100000: episode: 2588, duration: 0.681s, episode steps: 136, steps per second: 200, episode reward: 136.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: -0.190 [-1.475, 1.095], mean_best_reward: --
 94530/100000: episode: 2589, duration: 0.201s, episode steps: 36, steps per second: 179, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.065 [-0.602, 0.962], mean_best_reward: --
 94554/100000: episode: 2590, duration: 0.120s, episode steps: 24, steps per second: 201, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.542 [0.000, 1.000], mean observation: 0.117 [-0.372, 0.865], mean_best_reward: --
 94594/100000: episode: 2591, duration: 0.220s, episode steps: 40, steps per second: 182, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.575 [0.000, 1.000], mean observation: -0.040 [-2.217, 1.196], mean_best_reward: --
 94629/10000

 96290/100000: episode: 2622, duration: 0.136s, episode steps: 28, steps per second: 205, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.010 [-1.178, 0.831], mean_best_reward: --
 96331/100000: episode: 2623, duration: 0.187s, episode steps: 41, steps per second: 220, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.167 [-0.937, 0.554], mean_best_reward: --
 96416/100000: episode: 2624, duration: 0.378s, episode steps: 85, steps per second: 225, episode reward: 85.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.482 [0.000, 1.000], mean observation: -0.097 [-0.930, 1.520], mean_best_reward: --
 96445/100000: episode: 2625, duration: 0.132s, episode steps: 29, steps per second: 220, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: 0.071 [-0.934, 1.612], mean_best_reward: --
 96458/100000

 98163/100000: episode: 2657, duration: 0.242s, episode steps: 48, steps per second: 198, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.095 [-1.130, 0.708], mean_best_reward: --
 98193/100000: episode: 2658, duration: 0.158s, episode steps: 30, steps per second: 190, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.101 [-1.105, 0.622], mean_best_reward: --
 98251/100000: episode: 2659, duration: 0.300s, episode steps: 58, steps per second: 194, episode reward: 58.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.466 [0.000, 1.000], mean observation: -0.133 [-0.974, 0.537], mean_best_reward: --
 98322/100000: episode: 2660, duration: 0.341s, episode steps: 71, steps per second: 208, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.479 [0.000, 1.000], mean observation: -0.158 [-0.955, 0.642], mean_best_reward: --
 98429/10000

 99807/100000: episode: 2691, duration: 0.115s, episode steps: 21, steps per second: 182, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.381 [0.000, 1.000], mean observation: 0.098 [-1.001, 1.888], mean_best_reward: --
 99860/100000: episode: 2692, duration: 0.276s, episode steps: 53, steps per second: 192, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.107 [-0.944, 0.544], mean_best_reward: --
 99931/100000: episode: 2693, duration: 0.339s, episode steps: 71, steps per second: 209, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: -0.146 [-1.229, 0.689], mean_best_reward: --
 99988/100000: episode: 2694, duration: 0.289s, episode steps: 57, steps per second: 197, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.145 [-0.720, 1.394], mean_best_reward: --
done, took 500

<tensorflow.python.keras.callbacks.History at 0x7fb3f4839550>

Saving the trained weights to disk so they can be reloaded later without retraining.

In [47]:
# Persist the agent's current weights under a name derived from the
# environment; overwrite=True replaces any existing file of the same name.
cem.save_weights(
    filepath='cem_{}_params.h5f'.format(ENV_NAME),
    overwrite=True,
)

Testing the trained agent for 5 episodes.

In [48]:
# Reload the weights written by the save cell before evaluating, so this cell
# tests the trained parameters even when it is run in a fresh kernel (or after
# other experiments have modified the agent) without repeating the long fit.
# When run directly after the save cell this reload is a no-op.
cem.load_weights('cem_{}_params.h5f'.format(ENV_NAME))

# Run 5 evaluation episodes; visualize=True renders the environment, which
# requires a display to be available.
cem.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 8.000, steps: 8
Episode 2: reward: 11.000, steps: 11
Episode 3: reward: 9.000, steps: 9
Episode 4: reward: 9.000, steps: 9
Episode 5: reward: 11.000, steps: 11


<tensorflow.python.keras.callbacks.History at 0x7fee8392e460>