In [2]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

ENV_NAME = 'CartPole-v0'
SEED = 123  # single seed value applied to both NumPy and the environment


# Build the environment, then seed NumPy's global RNG and the environment
# with the same value.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Sizes read from the environment's spaces: the first dimension of the
# observation shape and the number of actions.
obs_dim = env.observation_space.shape[0]
nb_actions = env.action_space.n

# Option 1: simple model (kept for reference; uncomment to use it instead of Option 2)
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))

# Option 2: deep network
# The input shape is the observation shape prefixed with a length-1 axis
# (presumably the window_length=1 used by the agent's memory -- confirm).
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# Three identical hidden stages: a 16-unit Dense layer followed by ReLU.
for _ in range(3):
    model.add(Dense(16))
    model.add(Activation('relu'))
# Output stage: one unit per action, passed through softmax.
model.add(Dense(nb_actions))
model.add(Activation('softmax'))


# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the table.
model.summary()


# Configure the agent: an episode-parameter memory plus the CEM agent itself.
# Note that compile() is called with no arguments here -- no optimizer or
# metrics are passed.
memory = EpisodeParameterMemory(limit=1000, window_length=1)

cem = CEMAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    batch_size=50,
    nb_steps_warmup=2000,
    train_interval=50,
    elite_frac=0.05,
)
cem.compile()

# Train for 100000 steps. Rendering is switched off (visualize=False) because
# it slows down training quite a lot; set it to True to watch the agent learn.
# Training can be aborted prematurely using Ctrl + C.
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)

# After training is done, save the weights to a file named after the environment.
weights_path = 'cem_{}_params.h5f'.format(ENV_NAME)
cem.save_weights(weights_path, overwrite=True)

# Finally, evaluate the agent for 20 episodes with rendering enabled.
cem.test(env, nb_episodes=20, visualize=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_4 (Activation)    (None, 16)                0         
__________

   564/100000: episode: 33, duration: 0.018s, episode steps: 10, steps per second: 553, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.130 [-2.551, 1.590], mean_best_reward: --
   575/100000: episode: 34, duration: 0.023s, episode steps: 11, steps per second: 480, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.139 [-1.325, 2.327], mean_best_reward: --
   595/100000: episode: 35, duration: 0.029s, episode steps: 20, steps per second: 678, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.078 [-1.525, 0.774], mean_best_reward: --
   615/100000: episode: 36, duration: 0.029s, episode steps: 20, steps per second: 683, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.087 [-1.785, 0.967], mean_best_reward: --
   629/100000: episod

  1397/100000: episode: 73, duration: 0.063s, episode steps: 51, steps per second: 805, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.627 [0.000, 1.000], mean observation: 0.044 [-3.052, 2.456], mean_best_reward: --
  1408/100000: episode: 74, duration: 0.018s, episode steps: 11, steps per second: 625, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.141 [-0.953, 1.801], mean_best_reward: --
  1417/100000: episode: 75, duration: 0.015s, episode steps: 9, steps per second: 600, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.149 [-1.740, 2.796], mean_best_reward: --
  1438/100000: episode: 76, duration: 0.031s, episode steps: 21, steps per second: 682, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.091 [-1.522, 2.486], mean_best_reward: --
  1447/100000: episode: 77

  1931/100000: episode: 109, duration: 0.017s, episode steps: 11, steps per second: 642, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.130 [-3.285, 2.182], mean_best_reward: --
  1946/100000: episode: 110, duration: 0.022s, episode steps: 15, steps per second: 690, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.098 [-1.916, 1.178], mean_best_reward: --
  1958/100000: episode: 111, duration: 0.019s, episode steps: 12, steps per second: 648, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.095 [-1.190, 1.983], mean_best_reward: --
  1967/100000: episode: 112, duration: 0.013s, episode steps: 9, steps per second: 670, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.152 [-2.866, 1.782], mean_best_reward: --
  1980/100000: epis

  2449/100000: episode: 143, duration: 0.018s, episode steps: 12, steps per second: 684, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.140 [-0.942, 1.784], mean_best_reward: --
  2459/100000: episode: 144, duration: 0.017s, episode steps: 10, steps per second: 575, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: -0.122 [-1.902, 1.154], mean_best_reward: --
  2471/100000: episode: 145, duration: 0.018s, episode steps: 12, steps per second: 672, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.917 [0.000, 1.000], mean observation: -0.105 [-3.019, 1.974], mean_best_reward: --
  2481/100000: episode: 146, duration: 0.014s, episode steps: 10, steps per second: 704, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: -0.151 [-2.288, 1.354], mean_best_reward: --
  2500/100000: ep

  2974/100000: episode: 176, duration: 0.024s, episode steps: 16, steps per second: 662, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.064 [-1.190, 1.680], mean_best_reward: --
  2986/100000: episode: 177, duration: 0.021s, episode steps: 12, steps per second: 583, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.750 [0.000, 1.000], mean observation: -0.095 [-2.023, 1.198], mean_best_reward: --
  2994/100000: episode: 178, duration: 0.014s, episode steps: 8, steps per second: 573, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.145 [-1.581, 2.541], mean_best_reward: --
  3010/100000: episode: 179, duration: 0.028s, episode steps: 16, steps per second: 568, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.750 [0.000, 1.000], mean observation: -0.077 [-2.675, 1.753], mean_best_reward: --
  3021/100000: episo

  3651/100000: episode: 211, duration: 0.022s, episode steps: 15, steps per second: 681, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.092 [-1.798, 2.814], mean_best_reward: --
  3679/100000: episode: 212, duration: 0.039s, episode steps: 28, steps per second: 716, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: -0.075 [-1.295, 0.736], mean_best_reward: --
  3692/100000: episode: 213, duration: 0.018s, episode steps: 13, steps per second: 704, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.231 [0.000, 1.000], mean observation: 0.114 [-1.531, 2.442], mean_best_reward: --
  3703/100000: episode: 214, duration: 0.016s, episode steps: 11, steps per second: 669, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.134 [-1.166, 1.991], mean_best_reward: --
  3735/100000: epis

  4568/100000: episode: 246, duration: 0.045s, episode steps: 35, steps per second: 782, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.543 [0.000, 1.000], mean observation: -0.055 [-1.446, 0.570], mean_best_reward: --
  4582/100000: episode: 247, duration: 0.020s, episode steps: 14, steps per second: 688, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.091 [-0.954, 1.473], mean_best_reward: --
  4612/100000: episode: 248, duration: 0.034s, episode steps: 30, steps per second: 892, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.433 [0.000, 1.000], mean observation: 0.065 [-0.843, 1.685], mean_best_reward: --
  4627/100000: episode: 249, duration: 0.023s, episode steps: 15, steps per second: 646, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: -0.124 [-1.165, 0.559], mean_best_reward: --
  4644/100000: epi

  5110/100000: episode: 282, duration: 0.022s, episode steps: 15, steps per second: 691, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.100 [-0.815, 1.562], mean_best_reward: --
  5122/100000: episode: 283, duration: 0.020s, episode steps: 12, steps per second: 586, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.111 [-1.029, 1.608], mean_best_reward: --
  5134/100000: episode: 284, duration: 0.019s, episode steps: 12, steps per second: 646, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.167 [0.000, 1.000], mean observation: 0.095 [-1.581, 2.495], mean_best_reward: --
  5155/100000: episode: 285, duration: 0.032s, episode steps: 21, steps per second: 662, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.286 [0.000, 1.000], mean observation: 0.075 [-1.745, 2.802], mean_best_reward: --
  5174/100000: episo

  5787/100000: episode: 321, duration: 0.034s, episode steps: 23, steps per second: 670, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.348 [0.000, 1.000], mean observation: 0.088 [-1.343, 2.389], mean_best_reward: --
  5799/100000: episode: 322, duration: 0.021s, episode steps: 12, steps per second: 572, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.167 [0.000, 1.000], mean observation: 0.142 [-1.530, 2.586], mean_best_reward: --
  5810/100000: episode: 323, duration: 0.020s, episode steps: 11, steps per second: 543, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.112 [-1.173, 1.943], mean_best_reward: --
  5838/100000: episode: 324, duration: 0.041s, episode steps: 28, steps per second: 690, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.086 [-0.979, 0.570], mean_best_reward: --
  5853/100000: epis

  6285/100000: episode: 355, duration: 0.024s, episode steps: 16, steps per second: 671, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.084 [-1.007, 1.666], mean_best_reward: --
  6298/100000: episode: 356, duration: 0.022s, episode steps: 13, steps per second: 591, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.154 [0.000, 1.000], mean observation: 0.091 [-1.742, 2.749], mean_best_reward: --
  6313/100000: episode: 357, duration: 0.021s, episode steps: 15, steps per second: 726, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.733 [0.000, 1.000], mean observation: -0.083 [-2.276, 1.417], mean_best_reward: --
  6329/100000: episode: 358, duration: 0.022s, episode steps: 16, steps per second: 739, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.625 [0.000, 1.000], mean observation: -0.103 [-1.477, 0.754], mean_best_reward: --
  6373/100000: epi

  7050/100000: episode: 391, duration: 0.020s, episode steps: 16, steps per second: 787, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.750 [0.000, 1.000], mean observation: -0.099 [-2.667, 1.599], mean_best_reward: --
  7090/100000: episode: 392, duration: 0.049s, episode steps: 40, steps per second: 811, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.065 [-0.764, 1.540], mean_best_reward: --
  7117/100000: episode: 393, duration: 0.034s, episode steps: 27, steps per second: 806, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.046 [-1.058, 0.609], mean_best_reward: --
  7149/100000: episode: 394, duration: 0.043s, episode steps: 32, steps per second: 748, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: -0.081 [-1.254, 0.712], mean_best_reward: --
  7160/100000: ep

  8129/100000: episode: 429, duration: 0.032s, episode steps: 24, steps per second: 760, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.117 [-0.983, 2.035], mean_best_reward: --
  8143/100000: episode: 430, duration: 0.023s, episode steps: 14, steps per second: 613, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.286 [0.000, 1.000], mean observation: 0.072 [-1.182, 1.930], mean_best_reward: --
  8171/100000: episode: 431, duration: 0.039s, episode steps: 28, steps per second: 710, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: 0.065 [-1.000, 1.819], mean_best_reward: --
  8195/100000: episode: 432, duration: 0.035s, episode steps: 24, steps per second: 680, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.542 [0.000, 1.000], mean observation: -0.062 [-1.531, 0.836], mean_best_reward: --
  8223/100000: epis

  9259/100000: episode: 463, duration: 0.033s, episode steps: 24, steps per second: 737, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: 0.041 [-0.820, 1.385], mean_best_reward: --
  9374/100000: episode: 464, duration: 0.141s, episode steps: 115, steps per second: 818, episode reward: 115.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.461 [0.000, 1.000], mean observation: -0.449 [-2.545, 0.977], mean_best_reward: --
  9392/100000: episode: 465, duration: 0.027s, episode steps: 18, steps per second: 675, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.095 [-0.571, 1.116], mean_best_reward: --
  9400/100000: episode: 466, duration: 0.016s, episode steps: 8, steps per second: 516, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.133 [-2.543, 1.596], mean_best_reward: --
  9418/100000: epi

 10477/100000: episode: 497, duration: 0.105s, episode steps: 92, steps per second: 873, episode reward: 92.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.147 [-1.046, 1.843], mean_best_reward: --
 10505/100000: episode: 498, duration: 0.035s, episode steps: 28, steps per second: 794, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.081 [-1.023, 0.558], mean_best_reward: --
 10516/100000: episode: 499, duration: 0.016s, episode steps: 11, steps per second: 676, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.636 [0.000, 1.000], mean observation: -0.101 [-1.800, 1.169], mean_best_reward: --
 10557/100000: episode: 500, duration: 0.051s, episode steps: 41, steps per second: 811, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.096 [-0.861, 0.578], mean_best_reward: --
 10580/100000: ep

 11511/100000: episode: 534, duration: 0.063s, episode steps: 47, steps per second: 751, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.103 [-0.919, 0.391], mean_best_reward: --
 11527/100000: episode: 535, duration: 0.022s, episode steps: 16, steps per second: 712, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.082 [-1.139, 1.680], mean_best_reward: --
 11543/100000: episode: 536, duration: 0.023s, episode steps: 16, steps per second: 697, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.625 [0.000, 1.000], mean observation: -0.112 [-1.620, 0.753], mean_best_reward: --
 11556/100000: episode: 537, duration: 0.018s, episode steps: 13, steps per second: 716, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.385 [0.000, 1.000], mean observation: 0.094 [-0.800, 1.439], mean_best_reward: --
 11592/100000: epi

 12478/100000: episode: 570, duration: 0.019s, episode steps: 11, steps per second: 573, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.126 [-1.025, 1.800], mean_best_reward: --
 12499/100000: episode: 571, duration: 0.035s, episode steps: 21, steps per second: 608, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.381 [0.000, 1.000], mean observation: 0.077 [-0.996, 1.804], mean_best_reward: --
 12516/100000: episode: 572, duration: 0.027s, episode steps: 17, steps per second: 633, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.647 [0.000, 1.000], mean observation: -0.095 [-1.760, 0.966], mean_best_reward: --
 12532/100000: episode: 573, duration: 0.024s, episode steps: 16, steps per second: 666, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.098 [-1.558, 2.636], mean_best_reward: --
 12542/100000: epis

 13127/100000: episode: 604, duration: 0.038s, episode steps: 30, steps per second: 788, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: -0.095 [-1.410, 0.560], mean_best_reward: --
 13140/100000: episode: 605, duration: 0.021s, episode steps: 13, steps per second: 625, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.154 [0.000, 1.000], mean observation: 0.117 [-1.728, 2.798], mean_best_reward: --
 13154/100000: episode: 606, duration: 0.020s, episode steps: 14, steps per second: 697, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.357 [0.000, 1.000], mean observation: 0.082 [-0.998, 1.689], mean_best_reward: --
 13164/100000: episode: 607, duration: 0.016s, episode steps: 10, steps per second: 614, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.136 [-1.161, 1.995], mean_best_reward: --
 13195/100000: epis

 14186/100000: episode: 637, duration: 0.035s, episode steps: 30, steps per second: 852, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: 0.081 [-0.599, 1.386], mean_best_reward: --
 14204/100000: episode: 638, duration: 0.025s, episode steps: 18, steps per second: 717, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.118 [-1.523, 0.947], mean_best_reward: --
 14238/100000: episode: 639, duration: 0.042s, episode steps: 34, steps per second: 818, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.091 [-0.872, 0.400], mean_best_reward: --
 14260/100000: episode: 640, duration: 0.028s, episode steps: 22, steps per second: 783, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.065 [-0.757, 1.312], mean_best_reward: --
 14309/100000: epi

 15499/100000: episode: 674, duration: 0.025s, episode steps: 20, steps per second: 796, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: 0.090 [-0.578, 1.127], mean_best_reward: --
 15606/100000: episode: 675, duration: 0.126s, episode steps: 107, steps per second: 846, episode reward: 107.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.216 [-1.512, 1.308], mean_best_reward: --
 15657/100000: episode: 676, duration: 0.061s, episode steps: 51, steps per second: 842, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.040 [-1.077, 0.591], mean_best_reward: --
 15699/100000: episode: 677, duration: 0.054s, episode steps: 42, steps per second: 779, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.053 [-1.230, 0.834], mean_best_reward: --
 15717/100000: 

 16654/100000: episode: 708, duration: 0.027s, episode steps: 20, steps per second: 749, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.083 [-1.208, 0.565], mean_best_reward: --
 16669/100000: episode: 709, duration: 0.025s, episode steps: 15, steps per second: 610, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.733 [0.000, 1.000], mean observation: -0.087 [-2.483, 1.570], mean_best_reward: --
 16679/100000: episode: 710, duration: 0.016s, episode steps: 10, steps per second: 620, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: -0.135 [-2.162, 1.323], mean_best_reward: --
 16695/100000: episode: 711, duration: 0.021s, episode steps: 16, steps per second: 748, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.625 [0.000, 1.000], mean observation: -0.093 [-2.001, 1.201], mean_best_reward: --
 16769/100000: e

 18276/100000: episode: 746, duration: 0.048s, episode steps: 40, steps per second: 838, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: 0.064 [-0.950, 1.565], mean_best_reward: --
 18315/100000: episode: 747, duration: 0.047s, episode steps: 39, steps per second: 836, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: -0.074 [-1.038, 0.626], mean_best_reward: --
 18339/100000: episode: 748, duration: 0.030s, episode steps: 24, steps per second: 812, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: 0.084 [-0.579, 1.348], mean_best_reward: --
 18412/100000: episode: 749, duration: 0.084s, episode steps: 73, steps per second: 873, episode reward: 73.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: -0.133 [-1.518, 1.269], mean_best_reward: --
 18446/100000: epi

 19732/100000: episode: 779, duration: 0.052s, episode steps: 38, steps per second: 724, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: 0.133 [-0.498, 1.788], mean_best_reward: --
 19769/100000: episode: 780, duration: 0.052s, episode steps: 37, steps per second: 714, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.459 [0.000, 1.000], mean observation: -0.136 [-0.838, 0.348], mean_best_reward: --
 19814/100000: episode: 781, duration: 0.056s, episode steps: 45, steps per second: 807, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.045 [-1.388, 0.809], mean_best_reward: --
 19903/100000: episode: 782, duration: 0.104s, episode steps: 89, steps per second: 860, episode reward: 89.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.506 [0.000, 1.000], mean observation: 0.070 [-1.070, 0.949], mean_best_reward: --
 19931/100000: epi

 21055/100000: episode: 814, duration: 0.029s, episode steps: 23, steps per second: 781, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.435 [0.000, 1.000], mean observation: 0.108 [-0.756, 1.727], mean_best_reward: --
 21069/100000: episode: 815, duration: 0.022s, episode steps: 14, steps per second: 649, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.143 [0.000, 1.000], mean observation: 0.096 [-1.968, 3.081], mean_best_reward: --
 21080/100000: episode: 816, duration: 0.017s, episode steps: 11, steps per second: 631, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.727 [0.000, 1.000], mean observation: -0.131 [-2.219, 1.375], mean_best_reward: --
 21100/100000: episode: 817, duration: 0.024s, episode steps: 20, steps per second: 823, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: 0.099 [-0.568, 1.240], mean_best_reward: --
 21119/100000: epis

 21915/100000: episode: 851, duration: 0.031s, episode steps: 24, steps per second: 768, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.054 [-1.564, 2.610], mean_best_reward: 80.500000
 21926/100000: episode: 852, duration: 0.019s, episode steps: 11, steps per second: 584, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.636 [0.000, 1.000], mean observation: -0.107 [-1.551, 1.010], mean_best_reward: --
 21944/100000: episode: 853, duration: 0.022s, episode steps: 18, steps per second: 823, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: 0.083 [-0.969, 1.461], mean_best_reward: --
 21973/100000: episode: 854, duration: 0.036s, episode steps: 29, steps per second: 805, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: -0.088 [-1.035, 0.444], mean_best_reward: --
 22001/1000

 22673/100000: episode: 884, duration: 0.022s, episode steps: 17, steps per second: 756, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: -0.084 [-1.465, 1.021], mean_best_reward: --
 22691/100000: episode: 885, duration: 0.026s, episode steps: 18, steps per second: 699, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.114 [-1.014, 0.551], mean_best_reward: --
 22719/100000: episode: 886, duration: 0.033s, episode steps: 28, steps per second: 836, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: -0.073 [-1.358, 0.561], mean_best_reward: --
 22742/100000: episode: 887, duration: 0.032s, episode steps: 23, steps per second: 719, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.609 [0.000, 1.000], mean observation: -0.069 [-1.733, 0.968], mean_best_reward: --
 22751/100000: e

 23765/100000: episode: 923, duration: 0.031s, episode steps: 25, steps per second: 801, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: -0.062 [-1.387, 0.648], mean_best_reward: --
 23787/100000: episode: 924, duration: 0.030s, episode steps: 22, steps per second: 735, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.065 [-1.432, 0.971], mean_best_reward: --
 23810/100000: episode: 925, duration: 0.031s, episode steps: 23, steps per second: 738, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.084 [-0.620, 1.173], mean_best_reward: --
 23819/100000: episode: 926, duration: 0.016s, episode steps: 9, steps per second: 576, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.129 [-2.258, 1.380], mean_best_reward: --
 23843/100000: epis

 25099/100000: episode: 961, duration: 0.153s, episode steps: 131, steps per second: 854, episode reward: 131.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.435 [0.000, 1.000], mean observation: -0.410 [-3.153, 0.863], mean_best_reward: --
 25110/100000: episode: 962, duration: 0.017s, episode steps: 11, steps per second: 665, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.118 [-0.950, 1.771], mean_best_reward: --
 25156/100000: episode: 963, duration: 0.051s, episode steps: 46, steps per second: 897, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.110 [-1.946, 1.994], mean_best_reward: --
 25180/100000: episode: 964, duration: 0.032s, episode steps: 24, steps per second: 762, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.061 [-0.816, 1.270], mean_best_reward: --
 25207/100000: e

 26906/100000: episode: 998, duration: 0.036s, episode steps: 29, steps per second: 813, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.075 [-0.602, 0.956], mean_best_reward: --
 26928/100000: episode: 999, duration: 0.030s, episode steps: 22, steps per second: 724, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: -0.039 [-1.530, 0.965], mean_best_reward: --
 27022/100000: episode: 1000, duration: 0.106s, episode steps: 94, steps per second: 889, episode reward: 94.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: -0.008 [-0.916, 0.892], mean_best_reward: --
 27037/100000: episode: 1001, duration: 0.021s, episode steps: 15, steps per second: 728, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.074 [-1.825, 1.207], mean_best_reward: 85.500000
 27182/1

 28689/100000: episode: 1032, duration: 0.027s, episode steps: 22, steps per second: 813, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.080 [-0.769, 1.401], mean_best_reward: --
 28780/100000: episode: 1033, duration: 0.106s, episode steps: 91, steps per second: 856, episode reward: 91.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: -0.157 [-1.529, 1.115], mean_best_reward: --
 28823/100000: episode: 1034, duration: 0.056s, episode steps: 43, steps per second: 771, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.150 [-0.689, 1.108], mean_best_reward: --
 28943/100000: episode: 1035, duration: 0.154s, episode steps: 120, steps per second: 778, episode reward: 120.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.283 [-1.062, 1.678], mean_best_reward: --
 29124/100000

 30804/100000: episode: 1069, duration: 0.079s, episode steps: 56, steps per second: 712, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: -0.183 [-2.264, 1.090], mean_best_reward: --
 30842/100000: episode: 1070, duration: 0.048s, episode steps: 38, steps per second: 786, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.079 [-0.475, 1.107], mean_best_reward: --
 30874/100000: episode: 1071, duration: 0.039s, episode steps: 32, steps per second: 824, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.030 [-1.005, 0.621], mean_best_reward: --
 30897/100000: episode: 1072, duration: 0.028s, episode steps: 23, steps per second: 825, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: -0.090 [-1.155, 0.448], mean_best_reward: --
 30909/100000

 31732/100000: episode: 1105, duration: 0.055s, episode steps: 41, steps per second: 748, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: 0.102 [-0.501, 1.553], mean_best_reward: --
 31767/100000: episode: 1106, duration: 0.043s, episode steps: 35, steps per second: 817, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.026 [-0.781, 1.172], mean_best_reward: --
 31860/100000: episode: 1107, duration: 0.107s, episode steps: 93, steps per second: 869, episode reward: 93.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: -0.034 [-0.943, 1.031], mean_best_reward: --
 31925/100000: episode: 1108, duration: 0.083s, episode steps: 65, steps per second: 784, episode reward: 65.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.492 [0.000, 1.000], mean observation: -0.046 [-1.191, 1.026], mean_best_reward: --
 31949/100000:

 33054/100000: episode: 1139, duration: 0.045s, episode steps: 32, steps per second: 719, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: 0.038 [-0.578, 1.267], mean_best_reward: --
 33119/100000: episode: 1140, duration: 0.085s, episode steps: 65, steps per second: 765, episode reward: 65.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: 0.056 [-1.317, 1.188], mean_best_reward: --
 33135/100000: episode: 1141, duration: 0.022s, episode steps: 16, steps per second: 713, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.088 [-1.445, 0.959], mean_best_reward: --
 33169/100000: episode: 1142, duration: 0.050s, episode steps: 34, steps per second: 679, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.103 [-0.384, 1.186], mean_best_reward: --
 33194/100000: 

 34696/100000: episode: 1174, duration: 0.086s, episode steps: 73, steps per second: 846, episode reward: 73.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.507 [0.000, 1.000], mean observation: 0.006 [-0.627, 0.935], mean_best_reward: --
 34806/100000: episode: 1175, duration: 0.131s, episode steps: 110, steps per second: 839, episode reward: 110.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.445 [0.000, 1.000], mean observation: -0.331 [-2.581, 1.362], mean_best_reward: --
 34825/100000: episode: 1176, duration: 0.026s, episode steps: 19, steps per second: 721, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.737 [0.000, 1.000], mean observation: -0.059 [-2.681, 1.772], mean_best_reward: --
 34872/100000: episode: 1177, duration: 0.058s, episode steps: 47, steps per second: 809, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.068 [-0.977, 0.604], mean_best_reward: --
 34887/1000

 36155/100000: episode: 1209, duration: 0.110s, episode steps: 91, steps per second: 827, episode reward: 91.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: 0.096 [-1.022, 2.636], mean_best_reward: --
 36268/100000: episode: 1210, duration: 0.132s, episode steps: 113, steps per second: 859, episode reward: 113.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.504 [0.000, 1.000], mean observation: -0.092 [-1.108, 0.993], mean_best_reward: --
 36309/100000: episode: 1211, duration: 0.049s, episode steps: 41, steps per second: 845, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: 0.016 [-0.960, 1.554], mean_best_reward: --
 36354/100000: episode: 1212, duration: 0.058s, episode steps: 45, steps per second: 776, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: 0.149 [-0.628, 1.273], mean_best_reward: --
 36381/100000

 37879/100000: episode: 1243, duration: 0.097s, episode steps: 74, steps per second: 762, episode reward: 74.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.541 [0.000, 1.000], mean observation: 0.066 [-1.796, 1.132], mean_best_reward: --
 37919/100000: episode: 1244, duration: 0.056s, episode steps: 40, steps per second: 709, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.087 [-1.195, 0.576], mean_best_reward: --
 37930/100000: episode: 1245, duration: 0.017s, episode steps: 11, steps per second: 658, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.105 [-1.412, 2.241], mean_best_reward: --
 37979/100000: episode: 1246, duration: 0.061s, episode steps: 49, steps per second: 797, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: 0.167 [-0.844, 1.231], mean_best_reward: --
 38079/100000: 

 39871/100000: episode: 1279, duration: 0.066s, episode steps: 62, steps per second: 945, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.452 [0.000, 1.000], mean observation: -0.241 [-1.690, 0.686], mean_best_reward: --
 39895/100000: episode: 1280, duration: 0.031s, episode steps: 24, steps per second: 784, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.101 [-1.049, 0.549], mean_best_reward: --
 40006/100000: episode: 1281, duration: 0.132s, episode steps: 111, steps per second: 843, episode reward: 111.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.532 [0.000, 1.000], mean observation: 0.211 [-0.865, 1.446], mean_best_reward: --
 40076/100000: episode: 1282, duration: 0.083s, episode steps: 70, steps per second: 848, episode reward: 70.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.414 [0.000, 1.000], mean observation: -0.295 [-2.607, 1.815], mean_best_reward: --
 40093/1000

 41919/100000: episode: 1314, duration: 0.036s, episode steps: 31, steps per second: 858, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.548 [0.000, 1.000], mean observation: 0.099 [-0.551, 1.026], mean_best_reward: --
 42055/100000: episode: 1315, duration: 0.145s, episode steps: 136, steps per second: 940, episode reward: 136.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.134 [-1.067, 1.484], mean_best_reward: --
 42102/100000: episode: 1316, duration: 0.049s, episode steps: 47, steps per second: 951, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: 0.003 [-0.615, 0.926], mean_best_reward: --
 42142/100000: episode: 1317, duration: 0.046s, episode steps: 40, steps per second: 863, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.042 [-0.801, 1.140], mean_best_reward: --
 42183/100000:

 43506/100000: episode: 1353, duration: 0.127s, episode steps: 108, steps per second: 848, episode reward: 108.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.172 [-1.090, 0.871], mean_best_reward: --
 43549/100000: episode: 1354, duration: 0.049s, episode steps: 43, steps per second: 871, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.465 [0.000, 1.000], mean observation: -0.053 [-0.781, 0.593], mean_best_reward: --
 43596/100000: episode: 1355, duration: 0.051s, episode steps: 47, steps per second: 914, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.532 [0.000, 1.000], mean observation: 0.029 [-1.240, 0.933], mean_best_reward: --
 43687/100000: episode: 1356, duration: 0.099s, episode steps: 91, steps per second: 915, episode reward: 91.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: -0.314 [-2.671, 2.063], mean_best_reward: --
 43771/1000

 45404/100000: episode: 1387, duration: 0.247s, episode steps: 200, steps per second: 810, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: -0.140 [-1.481, 1.328], mean_best_reward: --
 45471/100000: episode: 1388, duration: 0.086s, episode steps: 67, steps per second: 780, episode reward: 67.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.433 [0.000, 1.000], mean observation: -0.171 [-2.072, 1.517], mean_best_reward: --
 45524/100000: episode: 1389, duration: 0.067s, episode steps: 53, steps per second: 791, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.160 [-1.002, 0.498], mean_best_reward: --
 45586/100000: episode: 1390, duration: 0.083s, episode steps: 62, steps per second: 746, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.128 [-1.727, 0.455], mean_best_reward: --
 45627/100

 46876/100000: episode: 1421, duration: 0.111s, episode steps: 87, steps per second: 783, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.460 [0.000, 1.000], mean observation: -0.126 [-1.268, 0.730], mean_best_reward: --
 46888/100000: episode: 1422, duration: 0.018s, episode steps: 12, steps per second: 674, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.111 [-1.208, 1.830], mean_best_reward: --
 46908/100000: episode: 1423, duration: 0.026s, episode steps: 20, steps per second: 778, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: 0.073 [-1.018, 1.642], mean_best_reward: --
 46931/100000: episode: 1424, duration: 0.032s, episode steps: 23, steps per second: 713, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: 0.116 [-0.419, 1.134], mean_best_reward: --
 47033/100000: 

 48362/100000: episode: 1455, duration: 0.047s, episode steps: 33, steps per second: 709, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.076 [-0.746, 1.094], mean_best_reward: --
 48375/100000: episode: 1456, duration: 0.019s, episode steps: 13, steps per second: 697, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.101 [-1.753, 0.986], mean_best_reward: --
 48386/100000: episode: 1457, duration: 0.017s, episode steps: 11, steps per second: 641, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.124 [-1.393, 2.324], mean_best_reward: --
 48485/100000: episode: 1458, duration: 0.119s, episode steps: 99, steps per second: 833, episode reward: 99.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.566 [0.000, 1.000], mean observation: 0.195 [-1.967, 2.587], mean_best_reward: --
 48594/100000: 

 50173/100000: episode: 1490, duration: 0.060s, episode steps: 44, steps per second: 736, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: 0.144 [-0.484, 0.958], mean_best_reward: --
 50186/100000: episode: 1491, duration: 0.022s, episode steps: 13, steps per second: 584, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.385 [0.000, 1.000], mean observation: 0.111 [-0.783, 1.354], mean_best_reward: --
 50280/100000: episode: 1492, duration: 0.120s, episode steps: 94, steps per second: 785, episode reward: 94.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: 0.010 [-1.145, 1.116], mean_best_reward: --
 50306/100000: episode: 1493, duration: 0.034s, episode steps: 26, steps per second: 774, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.577 [0.000, 1.000], mean observation: -0.076 [-1.951, 1.146], mean_best_reward: --
 50334/100000: 

 51964/100000: episode: 1526, duration: 0.123s, episode steps: 106, steps per second: 860, episode reward: 106.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.014 [-1.611, 1.213], mean_best_reward: --
 52001/100000: episode: 1527, duration: 0.044s, episode steps: 37, steps per second: 838, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.459 [0.000, 1.000], mean observation: -0.106 [-0.890, 0.277], mean_best_reward: --
 52112/100000: episode: 1528, duration: 0.130s, episode steps: 111, steps per second: 856, episode reward: 111.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.014 [-0.944, 1.617], mean_best_reward: --
 52173/100000: episode: 1529, duration: 0.073s, episode steps: 61, steps per second: 830, episode reward: 61.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.443 [0.000, 1.000], mean observation: -0.092 [-1.387, 1.626], mean_best_reward: --
 52219/10

 53663/100000: episode: 1563, duration: 0.055s, episode steps: 44, steps per second: 800, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.066 [-0.895, 0.588], mean_best_reward: --
 53820/100000: episode: 1564, duration: 0.188s, episode steps: 157, steps per second: 834, episode reward: 157.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.061 [-1.669, 1.243], mean_best_reward: --
 53891/100000: episode: 1565, duration: 0.085s, episode steps: 71, steps per second: 834, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.465 [0.000, 1.000], mean observation: -0.082 [-1.213, 0.663], mean_best_reward: --
 53942/100000: episode: 1566, duration: 0.061s, episode steps: 51, steps per second: 837, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.451 [0.000, 1.000], mean observation: -0.132 [-0.911, 0.622], mean_best_reward: --
 53986/100

 55449/100000: episode: 1597, duration: 0.050s, episode steps: 40, steps per second: 798, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.099 [-0.893, 0.385], mean_best_reward: --
 55569/100000: episode: 1598, duration: 0.137s, episode steps: 120, steps per second: 875, episode reward: 120.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.035 [-1.822, 0.772], mean_best_reward: --
 55605/100000: episode: 1599, duration: 0.043s, episode steps: 36, steps per second: 841, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.079 [-1.127, 0.792], mean_best_reward: --
 55673/100000: episode: 1600, duration: 0.077s, episode steps: 68, steps per second: 882, episode reward: 68.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.426 [0.000, 1.000], mean observation: -0.176 [-2.099, 1.990], mean_best_reward: --
 55715/100

 57434/100000: episode: 1630, duration: 0.102s, episode steps: 84, steps per second: 823, episode reward: 84.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.112 [-2.078, 1.714], mean_best_reward: --
 57487/100000: episode: 1631, duration: 0.067s, episode steps: 53, steps per second: 795, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.003 [-0.799, 1.024], mean_best_reward: --
 57514/100000: episode: 1632, duration: 0.035s, episode steps: 27, steps per second: 770, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.091 [-1.239, 0.570], mean_best_reward: --
 57524/100000: episode: 1633, duration: 0.019s, episode steps: 10, steps per second: 533, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.106 [-1.334, 0.810], mean_best_reward: --
 57536/100000

 59244/100000: episode: 1667, duration: 0.036s, episode steps: 27, steps per second: 755, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.069 [-1.506, 0.815], mean_best_reward: --
 59275/100000: episode: 1668, duration: 0.040s, episode steps: 31, steps per second: 767, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: 0.025 [-0.989, 1.311], mean_best_reward: --
 59293/100000: episode: 1669, duration: 0.024s, episode steps: 18, steps per second: 757, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: 0.072 [-0.810, 1.210], mean_best_reward: --
 59325/100000: episode: 1670, duration: 0.039s, episode steps: 32, steps per second: 826, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.132 [-0.878, 0.563], mean_best_reward: --
 59418/100000:

 60978/100000: episode: 1702, duration: 0.106s, episode steps: 87, steps per second: 821, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.142 [-1.329, 1.297], mean_best_reward: --
 61011/100000: episode: 1703, duration: 0.042s, episode steps: 33, steps per second: 785, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.120 [-0.541, 1.130], mean_best_reward: --
 61023/100000: episode: 1704, duration: 0.018s, episode steps: 12, steps per second: 674, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.583 [0.000, 1.000], mean observation: -0.116 [-1.689, 1.025], mean_best_reward: --
 61146/100000: episode: 1705, duration: 0.144s, episode steps: 123, steps per second: 857, episode reward: 123.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: 0.045 [-0.948, 0.794], mean_best_reward: --
 61254/100000

 62657/100000: episode: 1737, duration: 0.036s, episode steps: 31, steps per second: 851, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.119 [-0.789, 1.188], mean_best_reward: --
 62729/100000: episode: 1738, duration: 0.090s, episode steps: 72, steps per second: 802, episode reward: 72.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.010 [-0.971, 0.778], mean_best_reward: --
 62831/100000: episode: 1739, duration: 0.119s, episode steps: 102, steps per second: 859, episode reward: 102.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.461 [0.000, 1.000], mean observation: -0.217 [-1.746, 1.306], mean_best_reward: --
 62865/100000: episode: 1740, duration: 0.045s, episode steps: 34, steps per second: 750, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.074 [-1.172, 0.812], mean_best_reward: --
 62908/1000

 64758/100000: episode: 1770, duration: 0.051s, episode steps: 41, steps per second: 803, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: -0.004 [-0.811, 0.941], mean_best_reward: --
 64794/100000: episode: 1771, duration: 0.053s, episode steps: 36, steps per second: 682, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.639 [0.000, 1.000], mean observation: 0.014 [-2.574, 1.893], mean_best_reward: --
 64820/100000: episode: 1772, duration: 0.036s, episode steps: 26, steps per second: 716, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: 0.098 [-0.579, 1.464], mean_best_reward: --
 64908/100000: episode: 1773, duration: 0.109s, episode steps: 88, steps per second: 807, episode reward: 88.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.523 [0.000, 1.000], mean observation: 0.105 [-0.998, 1.076], mean_best_reward: --
 64952/100000: 

 66478/100000: episode: 1804, duration: 0.048s, episode steps: 34, steps per second: 707, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.079 [-0.615, 0.866], mean_best_reward: --
 66496/100000: episode: 1805, duration: 0.030s, episode steps: 18, steps per second: 599, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: 0.074 [-0.640, 1.292], mean_best_reward: --
 66527/100000: episode: 1806, duration: 0.040s, episode steps: 31, steps per second: 784, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: 0.065 [-0.451, 1.026], mean_best_reward: --
 66542/100000: episode: 1807, duration: 0.021s, episode steps: 15, steps per second: 699, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.064 [-1.814, 1.218], mean_best_reward: --
 66561/100000: 

 68246/100000: episode: 1840, duration: 0.031s, episode steps: 23, steps per second: 749, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.111 [-0.416, 0.965], mean_best_reward: --
 68281/100000: episode: 1841, duration: 0.048s, episode steps: 35, steps per second: 722, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.457 [0.000, 1.000], mean observation: -0.111 [-0.766, 0.405], mean_best_reward: --
 68340/100000: episode: 1842, duration: 0.069s, episode steps: 59, steps per second: 856, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.542 [0.000, 1.000], mean observation: 0.080 [-1.351, 1.143], mean_best_reward: --
 68382/100000: episode: 1843, duration: 0.052s, episode steps: 42, steps per second: 801, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.135 [-0.521, 1.253], mean_best_reward: --
 68454/100000: 

 69831/100000: episode: 1875, duration: 0.143s, episode steps: 118, steps per second: 827, episode reward: 118.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: -0.022 [-1.086, 0.789], mean_best_reward: --
 69962/100000: episode: 1876, duration: 0.145s, episode steps: 131, steps per second: 905, episode reward: 131.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: -0.313 [-2.223, 1.238], mean_best_reward: --
 69989/100000: episode: 1877, duration: 0.033s, episode steps: 27, steps per second: 828, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: -0.059 [-1.646, 0.815], mean_best_reward: --
 70001/100000: episode: 1878, duration: 0.020s, episode steps: 12, steps per second: 608, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.128 [-0.769, 1.572], mean_best_reward: --
 70069/10

 72024/100000: episode: 1910, duration: 0.071s, episode steps: 57, steps per second: 807, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.170 [-0.742, 1.317], mean_best_reward: --
 72096/100000: episode: 1911, duration: 0.086s, episode steps: 72, steps per second: 840, episode reward: 72.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: 0.008 [-1.204, 1.932], mean_best_reward: --
 72124/100000: episode: 1912, duration: 0.034s, episode steps: 28, steps per second: 820, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: -0.073 [-1.165, 0.607], mean_best_reward: --
 72206/100000: episode: 1913, duration: 0.099s, episode steps: 82, steps per second: 831, episode reward: 82.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.085 [-1.279, 1.020], mean_best_reward: --
 72244/100000:

 73998/100000: episode: 1944, duration: 0.142s, episode steps: 115, steps per second: 811, episode reward: 115.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.242 [-2.083, 1.187], mean_best_reward: --
 74064/100000: episode: 1945, duration: 0.082s, episode steps: 66, steps per second: 809, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.008 [-1.191, 1.006], mean_best_reward: --
 74152/100000: episode: 1946, duration: 0.107s, episode steps: 88, steps per second: 822, episode reward: 88.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.466 [0.000, 1.000], mean observation: -0.229 [-1.458, 0.645], mean_best_reward: --
 74182/100000: episode: 1947, duration: 0.040s, episode steps: 30, steps per second: 751, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: -0.053 [-1.378, 0.598], mean_best_reward: --
 74218/100

 75881/100000: episode: 1977, duration: 0.106s, episode steps: 93, steps per second: 880, episode reward: 93.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.527 [0.000, 1.000], mean observation: 0.121 [-1.111, 1.103], mean_best_reward: --
 75927/100000: episode: 1978, duration: 0.055s, episode steps: 46, steps per second: 829, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.076 [-1.257, 0.743], mean_best_reward: --
 76030/100000: episode: 1979, duration: 0.117s, episode steps: 103, steps per second: 884, episode reward: 103.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.021 [-1.126, 0.771], mean_best_reward: --
 76098/100000: episode: 1980, duration: 0.083s, episode steps: 68, steps per second: 822, episode reward: 68.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.084 [-0.632, 1.418], mean_best_reward: --
 76138/100000

 78011/100000: episode: 2011, duration: 0.058s, episode steps: 43, steps per second: 747, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.465 [0.000, 1.000], mean observation: -0.189 [-1.301, 0.586], mean_best_reward: --
 78097/100000: episode: 2012, duration: 0.105s, episode steps: 86, steps per second: 817, episode reward: 86.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.523 [0.000, 1.000], mean observation: 0.200 [-1.206, 1.543], mean_best_reward: --
 78142/100000: episode: 2013, duration: 0.056s, episode steps: 45, steps per second: 803, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.073 [-0.868, 0.553], mean_best_reward: --
 78179/100000: episode: 2014, duration: 0.047s, episode steps: 37, steps per second: 791, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.039 [-1.174, 0.608], mean_best_reward: --
 78195/100000

 79791/100000: episode: 2048, duration: 0.049s, episode steps: 37, steps per second: 757, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.049 [-0.748, 1.114], mean_best_reward: --
 79813/100000: episode: 2049, duration: 0.033s, episode steps: 22, steps per second: 665, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.071 [-0.583, 1.112], mean_best_reward: --
 79856/100000: episode: 2050, duration: 0.056s, episode steps: 43, steps per second: 763, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: -0.086 [-1.068, 0.583], mean_best_reward: --
 79962/100000: episode: 2051, duration: 0.139s, episode steps: 106, steps per second: 761, episode reward: 106.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.166 [-1.015, 1.162], mean_best_reward: 123.000000
 8011

 81946/100000: episode: 2082, duration: 0.135s, episode steps: 109, steps per second: 810, episode reward: 109.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.359 [-2.253, 1.227], mean_best_reward: --
 82061/100000: episode: 2083, duration: 0.145s, episode steps: 115, steps per second: 793, episode reward: 115.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.461 [0.000, 1.000], mean observation: -0.208 [-2.002, 1.097], mean_best_reward: --
 82152/100000: episode: 2084, duration: 0.121s, episode steps: 91, steps per second: 752, episode reward: 91.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.085 [-1.349, 1.577], mean_best_reward: --
 82235/100000: episode: 2085, duration: 0.100s, episode steps: 83, steps per second: 830, episode reward: 83.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.019 [-0.984, 1.223], mean_best_reward: --
 82265/10

 84424/100000: episode: 2115, duration: 0.077s, episode steps: 61, steps per second: 797, episode reward: 61.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.177 [-1.102, 0.733], mean_best_reward: --
 84496/100000: episode: 2116, duration: 0.093s, episode steps: 72, steps per second: 773, episode reward: 72.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.066 [-1.143, 1.200], mean_best_reward: --
 84535/100000: episode: 2117, duration: 0.054s, episode steps: 39, steps per second: 726, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.123 [-0.720, 0.404], mean_best_reward: --
 84563/100000: episode: 2118, duration: 0.039s, episode steps: 28, steps per second: 720, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.126 [-0.964, 0.574], mean_best_reward: --
 84626/100000

 86639/100000: episode: 2150, duration: 0.029s, episode steps: 22, steps per second: 752, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: -0.112 [-1.370, 0.571], mean_best_reward: --
 86684/100000: episode: 2151, duration: 0.059s, episode steps: 45, steps per second: 768, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: -0.028 [-1.232, 0.622], mean_best_reward: 118.500000
 86795/100000: episode: 2152, duration: 0.136s, episode steps: 111, steps per second: 817, episode reward: 111.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: 0.208 [-1.506, 2.284], mean_best_reward: --
 86829/100000: episode: 2153, duration: 0.043s, episode steps: 34, steps per second: 787, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: 0.082 [-0.563, 1.354], mean_best_reward: --
 868

 89097/100000: episode: 2184, duration: 0.046s, episode steps: 40, steps per second: 875, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.074 [-1.123, 0.731], mean_best_reward: --
 89180/100000: episode: 2185, duration: 0.105s, episode steps: 83, steps per second: 789, episode reward: 83.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.566 [0.000, 1.000], mean observation: 0.251 [-1.303, 2.215], mean_best_reward: --
 89249/100000: episode: 2186, duration: 0.079s, episode steps: 69, steps per second: 870, episode reward: 69.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.507 [0.000, 1.000], mean observation: 0.035 [-0.630, 1.026], mean_best_reward: --
 89308/100000: episode: 2187, duration: 0.078s, episode steps: 59, steps per second: 758, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.052 [-0.570, 1.013], mean_best_reward: --
 89327/100000: 

 91089/100000: episode: 2218, duration: 0.182s, episode steps: 145, steps per second: 796, episode reward: 145.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.137 [-1.862, 1.121], mean_best_reward: --
 91176/100000: episode: 2219, duration: 0.112s, episode steps: 87, steps per second: 778, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.207 [-1.664, 1.032], mean_best_reward: --
 91277/100000: episode: 2220, duration: 0.120s, episode steps: 101, steps per second: 844, episode reward: 101.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.436 [0.000, 1.000], mean observation: -0.441 [-2.414, 1.271], mean_best_reward: --
 91382/100000: episode: 2221, duration: 0.123s, episode steps: 105, steps per second: 852, episode reward: 105.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.084 [-1.110, 0.649], mean_best_reward: --
 91405

 93057/100000: episode: 2251, duration: 0.157s, episode steps: 139, steps per second: 886, episode reward: 139.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.547 [0.000, 1.000], mean observation: 0.386 [-1.869, 2.862], mean_best_reward: 146.500000
 93220/100000: episode: 2252, duration: 0.184s, episode steps: 163, steps per second: 884, episode reward: 163.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: -0.125 [-1.134, 1.131], mean_best_reward: --
 93244/100000: episode: 2253, duration: 0.031s, episode steps: 24, steps per second: 769, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.542 [0.000, 1.000], mean observation: -0.056 [-1.269, 0.836], mean_best_reward: --
 93331/100000: episode: 2254, duration: 0.102s, episode steps: 87, steps per second: 853, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: -0.009 [-0.780, 1.036], mean_best_reward: --
 

 95392/100000: episode: 2286, duration: 0.218s, episode steps: 184, steps per second: 844, episode reward: 184.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.016 [-1.122, 0.965], mean_best_reward: --
 95438/100000: episode: 2287, duration: 0.056s, episode steps: 46, steps per second: 815, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.565 [0.000, 1.000], mean observation: 0.180 [-0.812, 1.265], mean_best_reward: --
 95472/100000: episode: 2288, duration: 0.043s, episode steps: 34, steps per second: 793, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.192 [-0.744, 1.230], mean_best_reward: --
 95519/100000: episode: 2289, duration: 0.054s, episode steps: 47, steps per second: 870, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.060 [-0.926, 0.827], mean_best_reward: --
 95534/100000

 97373/100000: episode: 2322, duration: 0.111s, episode steps: 98, steps per second: 884, episode reward: 98.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.238 [-1.678, 1.107], mean_best_reward: --
 97457/100000: episode: 2323, duration: 0.099s, episode steps: 84, steps per second: 853, episode reward: 84.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.218 [-1.701, 1.126], mean_best_reward: --
 97613/100000: episode: 2324, duration: 0.177s, episode steps: 156, steps per second: 882, episode reward: 156.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: 0.186 [-1.043, 1.462], mean_best_reward: --
 97663/100000: episode: 2325, duration: 0.058s, episode steps: 50, steps per second: 857, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.460 [0.000, 1.000], mean observation: -0.125 [-0.984, 0.435], mean_best_reward: --
 97673/1000

 99815/100000: episode: 2356, duration: 0.180s, episode steps: 153, steps per second: 848, episode reward: 153.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: 0.359 [-1.403, 3.203], mean_best_reward: --
 99884/100000: episode: 2357, duration: 0.081s, episode steps: 69, steps per second: 847, episode reward: 69.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.252 [-1.591, 0.767], mean_best_reward: --
 99899/100000: episode: 2358, duration: 0.021s, episode steps: 15, steps per second: 706, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.119 [-1.297, 0.748], mean_best_reward: --
done, took 127.794 seconds
Testing for 20 episodes ...
Episode 1: reward: 141.000, steps: 141
Episode 2: reward: 148.000, steps: 148
Episode 3: reward: 141.000, steps: 141
Episode 4: reward: 136.000, steps: 136
Episode 5: reward: 140.000, steps: 140
Episode 

<keras.callbacks.History at 0x15f5de82978>