In [17]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

import time

In [2]:
ENV_NAME = 'CartPole-v0'
# One seed shared by every RNG seeded in this cell, so changing it in a single
# place keeps NumPy and the environment in sync.
SEED = 123

# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(SEED)  # seeds NumPy's global RNG
env.seed(SEED)        # seeds the environment's own RNG
nb_actions = env.action_space.n

[2017-10-23 14:38:52,183] Making new env: CartPole-v0


In [9]:
# Q-network: flattens the observation, passes it through three 16-unit ReLU
# layers and ends in a linear layer with one output per action.
model = Sequential([
    # Input carries a leading axis of size 1 in front of the observation shape
    # (presumably the memory window length -- confirm against the agent setup);
    # Flatten collapses it into a single feature vector.
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    # Linear head: one unbounded value per action.
    Dense(nb_actions),
    Activation('linear'),
])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_5 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_6 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_7 (Activation)    (None, 16)                0         
__________

In [10]:
# Exploration policy and replay buffer handed to the agent below.
policy = BoltzmannQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)

# Assemble the DQN agent around the Q-network, then compile it with Adam and
# mean absolute error as the tracked metric.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(optimizer=Adam(lr=1e-3), metrics=['mae'])

In [12]:
# Train the agent. The returned history object is kept so the per-episode
# metrics can be inspected or plotted after training instead of being discarded.
# NOTE(review): visualize=True renders the environment while training; the
# captured log sits at about 60 steps per second, so rendering is likely the
# bottleneck -- consider visualize=False for faster runs.
train_history = dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

Training for 50000 steps ...




    13/50000: episode: 1, duration: 1.001s, episode steps: 13, steps per second: 13, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.308 [0.000, 1.000], mean observation: 0.097 [-1.187, 1.795], loss: 0.529511, mean_absolute_error: 0.583668, mean_q: 0.156393




    25/50000: episode: 2, duration: 0.202s, episode steps: 12, steps per second: 59, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.155 [-1.136, 2.146], loss: 0.444032, mean_absolute_error: 0.574376, mean_q: 0.231963


    40/50000: episode: 3, duration: 0.244s, episode steps: 15, steps per second: 61, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.119 [-0.938, 1.797], loss: 0.323213, mean_absolute_error: 0.585834, mean_q: 0.395081


    53/50000: episode: 4, duration: 0.214s, episode steps: 13, steps per second: 61, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.769 [0.000, 1.000], mean observation: -0.108 [-2.481, 1.544], loss: 0.184119, mean_absolute_error: 0.569002, mean_q: 0.606931
    62/50000: episode: 5, duration: 0.150s, episode steps: 9, steps per second: 60, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.778 [0.000, 1.000], mean observation: -0.124 [-1.752, 1.023], loss: 0.140096, mean_absolute_error: 0.588662, mean_q: 0.774150


    76/50000: episode: 6, duration: 0.231s, episode steps: 14, steps per second: 61, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.643 [0.000, 1.000], mean observation: -0.083 [-1.911, 1.185], loss: 0.114886, mean_absolute_error: 0.614223, mean_q: 0.879863


    98/50000: episode: 7, duration: 0.366s, episode steps: 22, steps per second: 60, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.096 [-0.738, 1.400], loss: 0.073853, mean_absolute_error: 0.663707, mean_q: 1.089271


   130/50000: episode: 8, duration: 0.532s, episode steps: 32, steps per second: 60, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.562 [0.000, 1.000], mean observation: -0.066 [-1.804, 0.803], loss: 0.048574, mean_absolute_error: 0.772799, mean_q: 1.389310
   141/50000: episode: 9, duration: 0.189s, episode steps: 11, steps per second: 58, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.727 [0.000, 1.000], mean observation: -0.094 [-1.673, 1.030], loss: 0.036445, mean_absolute_error: 0.807438, mean_q: 1.523375


   154/50000: episode: 10, duration: 0.210s, episode steps: 13, steps per second: 62, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.769 [0.000, 1.000], mean observation: -0.106 [-2.179, 1.355], loss: 0.038421, mean_absolute_error: 0.848688, mean_q: 1.611370


   173/50000: episode: 11, duration: 0.316s, episode steps: 19, steps per second: 60, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.368 [0.000, 1.000], mean observation: 0.077 [-1.016, 1.793], loss: 0.042062, mean_absolute_error: 0.896645, mean_q: 1.704488
   185/50000: episode: 12, duration: 0.199s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.120 [-1.136, 1.902], loss: 0.031548, mean_absolute_error: 0.948664, mean_q: 1.860603


   197/50000: episode: 13, duration: 0.200s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.096 [-1.752, 1.027], loss: 0.054760, mean_absolute_error: 1.004980, mean_q: 1.945584


   225/50000: episode: 14, duration: 0.467s, episode steps: 28, steps per second: 60, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.357 [0.000, 1.000], mean observation: 0.023 [-1.539, 2.266], loss: 0.056043, mean_absolute_error: 1.072361, mean_q: 2.057847


   241/50000: episode: 15, duration: 0.266s, episode steps: 16, steps per second: 60, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.625 [0.000, 1.000], mean observation: -0.094 [-1.472, 0.787], loss: 0.065932, mean_absolute_error: 1.161924, mean_q: 2.227792


   258/50000: episode: 16, duration: 0.284s, episode steps: 17, steps per second: 60, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.412 [0.000, 1.000], mean observation: 0.084 [-0.977, 1.711], loss: 0.070361, mean_absolute_error: 1.224721, mean_q: 2.351013


   275/50000: episode: 17, duration: 0.282s, episode steps: 17, steps per second: 60, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.353 [0.000, 1.000], mean observation: 0.072 [-1.162, 1.951], loss: 0.074882, mean_absolute_error: 1.282310, mean_q: 2.461832


   295/50000: episode: 18, duration: 0.334s, episode steps: 20, steps per second: 60, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.074 [-0.985, 1.446], loss: 0.059444, mean_absolute_error: 1.367801, mean_q: 2.666522


   308/50000: episode: 19, duration: 0.216s, episode steps: 13, steps per second: 60, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.308 [0.000, 1.000], mean observation: 0.121 [-1.154, 1.958], loss: 0.090820, mean_absolute_error: 1.459547, mean_q: 2.794949


   332/50000: episode: 20, duration: 0.420s, episode steps: 24, steps per second: 57, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: 0.096 [-0.414, 1.254], loss: 0.088321, mean_absolute_error: 1.520937, mean_q: 2.982930


   347/50000: episode: 21, duration: 0.263s, episode steps: 15, steps per second: 57, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.120 [-0.773, 1.533], loss: 0.095194, mean_absolute_error: 1.600515, mean_q: 3.090780


   370/50000: episode: 22, duration: 0.399s, episode steps: 23, steps per second: 58, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.565 [0.000, 1.000], mean observation: -0.066 [-1.493, 0.783], loss: 0.119231, mean_absolute_error: 1.685561, mean_q: 3.226659


   397/50000: episode: 23, duration: 0.458s, episode steps: 27, steps per second: 59, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.117 [-0.577, 1.082], loss: 0.118190, mean_absolute_error: 1.790801, mean_q: 3.410290


   477/50000: episode: 24, duration: 1.556s, episode steps: 80, steps per second: 51, episode reward: 80.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: -0.017 [-1.641, 1.390], loss: 0.134886, mean_absolute_error: 1.976837, mean_q: 3.747192


   512/50000: episode: 25, duration: 0.852s, episode steps: 35, steps per second: 41, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: 0.030 [-0.602, 0.918], loss: 0.160525, mean_absolute_error: 2.203208, mean_q: 4.186751


   565/50000: episode: 26, duration: 0.885s, episode steps: 53, steps per second: 60, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: 0.097 [-1.153, 1.756], loss: 0.120412, mean_absolute_error: 2.313999, mean_q: 4.467968


   594/50000: episode: 27, duration: 0.479s, episode steps: 29, steps per second: 61, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.414 [0.000, 1.000], mean observation: -0.026 [-1.173, 1.554], loss: 0.144019, mean_absolute_error: 2.465998, mean_q: 4.799965


   612/50000: episode: 28, duration: 0.332s, episode steps: 18, steps per second: 54, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: -0.098 [-1.233, 0.737], loss: 0.168141, mean_absolute_error: 2.555117, mean_q: 4.945767


   632/50000: episode: 29, duration: 0.339s, episode steps: 20, steps per second: 59, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.066 [-1.678, 0.848], loss: 0.162648, mean_absolute_error: 2.671374, mean_q: 5.211683


   663/50000: episode: 30, duration: 0.511s, episode steps: 31, steps per second: 61, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.072 [-0.934, 1.537], loss: 0.191590, mean_absolute_error: 2.771890, mean_q: 5.393662


   703/50000: episode: 31, duration: 0.664s, episode steps: 40, steps per second: 60, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.425 [0.000, 1.000], mean observation: -0.165 [-1.128, 0.409], loss: 0.197236, mean_absolute_error: 2.901091, mean_q: 5.650207


   780/50000: episode: 32, duration: 1.283s, episode steps: 77, steps per second: 60, episode reward: 77.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: 0.050 [-1.009, 1.928], loss: 0.230078, mean_absolute_error: 3.161609, mean_q: 6.177276


   851/50000: episode: 33, duration: 1.183s, episode steps: 71, steps per second: 60, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.002 [-1.300, 0.991], loss: 0.257001, mean_absolute_error: 3.433844, mean_q: 6.720691


   863/50000: episode: 34, duration: 0.204s, episode steps: 12, steps per second: 59, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.119 [-1.322, 2.234], loss: 0.296808, mean_absolute_error: 3.620712, mean_q: 7.146391


   915/50000: episode: 35, duration: 0.868s, episode steps: 52, steps per second: 60, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.088 [-1.129, 0.717], loss: 0.218578, mean_absolute_error: 3.738826, mean_q: 7.450445


   941/50000: episode: 36, duration: 0.450s, episode steps: 26, steps per second: 58, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.115 [-0.900, 0.564], loss: 0.278431, mean_absolute_error: 3.896256, mean_q: 7.747449


   980/50000: episode: 37, duration: 0.641s, episode steps: 39, steps per second: 61, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: 0.055 [-1.339, 1.807], loss: 0.247206, mean_absolute_error: 4.079473, mean_q: 8.118120


  1008/50000: episode: 38, duration: 0.466s, episode steps: 28, steps per second: 60, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.103 [-0.571, 1.108], loss: 0.251182, mean_absolute_error: 4.217108, mean_q: 8.540570


  1026/50000: episode: 39, duration: 0.299s, episode steps: 18, steps per second: 60, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.095 [-0.914, 0.401], loss: 0.326352, mean_absolute_error: 4.326636, mean_q: 8.660825


  1142/50000: episode: 40, duration: 1.936s, episode steps: 116, steps per second: 60, episode reward: 116.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.128 [-0.839, 1.084], loss: 0.317452, mean_absolute_error: 4.609283, mean_q: 9.287205


  1239/50000: episode: 41, duration: 1.613s, episode steps: 97, steps per second: 60, episode reward: 97.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.022 [-1.027, 1.292], loss: 0.387872, mean_absolute_error: 5.122484, mean_q: 10.291980


  1314/50000: episode: 42, duration: 1.249s, episode steps: 75, steps per second: 60, episode reward: 75.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.547 [0.000, 1.000], mean observation: 0.016 [-2.273, 1.326], loss: 0.466937, mean_absolute_error: 5.446654, mean_q: 10.862349


  1395/50000: episode: 43, duration: 1.349s, episode steps: 81, steps per second: 60, episode reward: 81.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.543 [0.000, 1.000], mean observation: 0.369 [-0.929, 1.537], loss: 0.384960, mean_absolute_error: 5.786071, mean_q: 11.666390


  1559/50000: episode: 44, duration: 2.749s, episode steps: 164, steps per second: 60, episode reward: 164.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: -0.129 [-1.827, 1.050], loss: 0.370210, mean_absolute_error: 6.308635, mean_q: 12.755812


  1659/50000: episode: 45, duration: 1.666s, episode steps: 100, steps per second: 60, episode reward: 100.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.108 [-0.931, 0.810], loss: 0.427190, mean_absolute_error: 6.867767, mean_q: 13.912158


  1825/50000: episode: 46, duration: 2.788s, episode steps: 166, steps per second: 60, episode reward: 166.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: -0.078 [-1.170, 0.842], loss: 0.431308, mean_absolute_error: 7.499765, mean_q: 15.248272


  1999/50000: episode: 47, duration: 2.894s, episode steps: 174, steps per second: 60, episode reward: 174.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.123 [-1.375, 1.339], loss: 0.565792, mean_absolute_error: 8.297193, mean_q: 16.842787


  2110/50000: episode: 48, duration: 1.866s, episode steps: 111, steps per second: 59, episode reward: 111.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.209 [-1.550, 1.385], loss: 0.499934, mean_absolute_error: 8.964447, mean_q: 18.196674


  2310/50000: episode: 49, duration: 3.333s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.059 [-1.159, 1.333], loss: 0.783638, mean_absolute_error: 9.634171, mean_q: 19.547714


  2510/50000: episode: 50, duration: 3.352s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.010 [-1.179, 1.432], loss: 0.817176, mean_absolute_error: 10.603964, mean_q: 21.548399


  2710/50000: episode: 51, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.229 [-1.018, 1.883], loss: 1.143055, mean_absolute_error: 11.438346, mean_q: 23.191139


  2910/50000: episode: 52, duration: 3.332s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.029 [-1.420, 1.484], loss: 0.994509, mean_absolute_error: 12.371805, mean_q: 25.175171


  3104/50000: episode: 53, duration: 3.250s, episode steps: 194, steps per second: 60, episode reward: 194.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.214 [-1.424, 0.957], loss: 1.415034, mean_absolute_error: 13.330706, mean_q: 27.144775


  3304/50000: episode: 54, duration: 3.333s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.143 [-1.104, 1.056], loss: 1.544357, mean_absolute_error: 14.252715, mean_q: 28.952637


  3504/50000: episode: 55, duration: 3.333s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.253 [-1.667, 1.105], loss: 2.201223, mean_absolute_error: 15.045889, mean_q: 30.588306


  3704/50000: episode: 56, duration: 3.332s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.315 [-2.173, 1.034], loss: 1.648729, mean_absolute_error: 16.026667, mean_q: 32.606407


  3904/50000: episode: 57, duration: 3.350s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.083 [-1.109, 1.316], loss: 1.729986, mean_absolute_error: 16.825930, mean_q: 34.208977


  4104/50000: episode: 58, duration: 3.463s, episode steps: 200, steps per second: 58, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.282 [-1.878, 0.885], loss: 2.805698, mean_absolute_error: 17.685783, mean_q: 35.868931


  4304/50000: episode: 59, duration: 3.397s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.116 [-1.357, 1.063], loss: 2.937390, mean_absolute_error: 18.537958, mean_q: 37.602909


  4504/50000: episode: 60, duration: 3.397s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.081 [-1.009, 1.269], loss: 2.954977, mean_absolute_error: 19.420160, mean_q: 39.333282


  4704/50000: episode: 61, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.063 [-0.995, 1.137], loss: 2.499840, mean_absolute_error: 20.214088, mean_q: 41.062363


  4904/50000: episode: 62, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.355 [-2.368, 0.721], loss: 2.897238, mean_absolute_error: 21.038155, mean_q: 42.774883


  5104/50000: episode: 63, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.235 [-1.637, 0.794], loss: 2.842201, mean_absolute_error: 21.754837, mean_q: 44.231827


  5300/50000: episode: 64, duration: 3.264s, episode steps: 196, steps per second: 60, episode reward: 196.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.345 [-2.430, 0.731], loss: 4.782068, mean_absolute_error: 22.560257, mean_q: 45.634209


  5496/50000: episode: 65, duration: 3.262s, episode steps: 196, steps per second: 60, episode reward: 196.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.342 [-2.414, 0.875], loss: 3.854712, mean_absolute_error: 23.287458, mean_q: 47.119041


  5696/50000: episode: 66, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.270 [-1.923, 0.744], loss: 2.809205, mean_absolute_error: 23.925388, mean_q: 48.524170


  5896/50000: episode: 67, duration: 3.379s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.227 [-1.538, 0.840], loss: 3.641417, mean_absolute_error: 24.510405, mean_q: 49.569481


  6096/50000: episode: 68, duration: 3.346s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.120 [-1.095, 1.078], loss: 3.639603, mean_absolute_error: 25.308300, mean_q: 51.173721


  6296/50000: episode: 69, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.172 [-1.286, 0.852], loss: 3.673989, mean_absolute_error: 25.990150, mean_q: 52.630253


  6496/50000: episode: 70, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.120 [-1.098, 0.886], loss: 3.601358, mean_absolute_error: 26.593040, mean_q: 53.842674


  6696/50000: episode: 71, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.057 [-0.845, 1.101], loss: 3.345765, mean_absolute_error: 27.279478, mean_q: 55.235161


  6896/50000: episode: 72, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.008 [-0.929, 1.156], loss: 5.682310, mean_absolute_error: 27.941710, mean_q: 56.392181


  7096/50000: episode: 73, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.280 [-1.994, 0.942], loss: 6.775297, mean_absolute_error: 28.483030, mean_q: 57.497814


  7296/50000: episode: 74, duration: 3.328s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.017 [-0.827, 0.857], loss: 4.860448, mean_absolute_error: 28.938227, mean_q: 58.420326


  7496/50000: episode: 75, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.021 [-0.932, 0.969], loss: 6.235990, mean_absolute_error: 29.487724, mean_q: 59.607700


  7696/50000: episode: 76, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.036 [-0.919, 1.185], loss: 6.034208, mean_absolute_error: 29.963306, mean_q: 60.578445


  7889/50000: episode: 77, duration: 3.214s, episode steps: 193, steps per second: 60, episode reward: 193.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.365 [-2.409, 1.007], loss: 4.306662, mean_absolute_error: 30.410147, mean_q: 61.472466


  8089/50000: episode: 78, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.044 [-1.065, 1.143], loss: 6.785807, mean_absolute_error: 30.995258, mean_q: 62.411400


  8289/50000: episode: 79, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.318 [-2.172, 1.072], loss: 5.014330, mean_absolute_error: 31.330175, mean_q: 63.268917


  8482/50000: episode: 80, duration: 3.213s, episode steps: 193, steps per second: 60, episode reward: 193.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.368 [-2.424, 0.923], loss: 7.045807, mean_absolute_error: 31.592525, mean_q: 63.669609


  8682/50000: episode: 81, duration: 3.332s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.064 [-1.043, 1.100], loss: 6.988051, mean_absolute_error: 31.885330, mean_q: 64.284470


  8882/50000: episode: 82, duration: 3.327s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.185 [-1.441, 0.843], loss: 8.183540, mean_absolute_error: 32.260658, mean_q: 64.968765


  9082/50000: episode: 83, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.007 [-0.988, 1.232], loss: 7.260355, mean_absolute_error: 32.704689, mean_q: 65.919014


  9282/50000: episode: 84, duration: 3.362s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.022 [-1.055, 1.269], loss: 7.485858, mean_absolute_error: 33.179852, mean_q: 66.885544


  9482/50000: episode: 85, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.044 [-0.947, 0.936], loss: 7.232793, mean_absolute_error: 33.530106, mean_q: 67.586571


  9682/50000: episode: 86, duration: 3.346s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.063 [-0.907, 1.496], loss: 8.333709, mean_absolute_error: 33.939754, mean_q: 68.332565


  9882/50000: episode: 87, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.170 [-0.976, 1.543], loss: 8.429142, mean_absolute_error: 34.419849, mean_q: 69.263573


 10082/50000: episode: 88, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.153 [-1.152, 1.219], loss: 9.559345, mean_absolute_error: 34.604782, mean_q: 69.615211


 10269/50000: episode: 89, duration: 3.114s, episode steps: 187, steps per second: 60, episode reward: 187.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.535 [0.000, 1.000], mean observation: 0.350 [-1.109, 2.531], loss: 8.923779, mean_absolute_error: 35.032452, mean_q: 70.528793


 10469/50000: episode: 90, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.014 [-0.950, 1.235], loss: 6.868696, mean_absolute_error: 35.365711, mean_q: 71.253685


 10669/50000: episode: 91, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.215 [-1.593, 1.019], loss: 7.350414, mean_absolute_error: 35.760872, mean_q: 71.953552


 10869/50000: episode: 92, duration: 3.380s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.159 [-1.333, 1.011], loss: 8.088304, mean_absolute_error: 35.947086, mean_q: 72.369888


 11069/50000: episode: 93, duration: 3.379s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.134 [-1.116, 1.212], loss: 8.331317, mean_absolute_error: 36.126957, mean_q: 72.633553


 11269/50000: episode: 94, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.012 [-1.201, 1.018], loss: 7.816817, mean_absolute_error: 36.415428, mean_q: 73.320007


 11469/50000: episode: 95, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.122 [-1.161, 1.424], loss: 5.503520, mean_absolute_error: 36.754253, mean_q: 74.058357


 11669/50000: episode: 96, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.179 [-1.279, 1.155], loss: 7.636459, mean_absolute_error: 37.144310, mean_q: 74.776291


 11860/50000: episode: 97, duration: 3.179s, episode steps: 191, steps per second: 60, episode reward: 191.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.383 [-2.421, 0.714], loss: 7.826077, mean_absolute_error: 37.327934, mean_q: 75.193260


 12060/50000: episode: 98, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.051 [-1.107, 1.009], loss: 7.704801, mean_absolute_error: 37.435135, mean_q: 75.330658


 12260/50000: episode: 99, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.004 [-1.228, 1.290], loss: 11.234785, mean_absolute_error: 37.447701, mean_q: 75.377922


 12460/50000: episode: 100, duration: 3.344s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.073 [-1.372, 1.246], loss: 9.942094, mean_absolute_error: 37.668995, mean_q: 75.810844


 12660/50000: episode: 101, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.252 [-1.685, 1.183], loss: 9.584633, mean_absolute_error: 37.950367, mean_q: 76.340637


 12860/50000: episode: 102, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.264 [-1.779, 1.140], loss: 9.365086, mean_absolute_error: 38.170792, mean_q: 76.895950


 13060/50000: episode: 103, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.094 [-1.167, 1.311], loss: 6.325462, mean_absolute_error: 38.497597, mean_q: 77.611984


 13260/50000: episode: 104, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.243 [-1.689, 1.123], loss: 7.528147, mean_absolute_error: 38.672573, mean_q: 77.906395


 13460/50000: episode: 105, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.154 [-1.134, 0.816], loss: 6.973731, mean_absolute_error: 38.868790, mean_q: 78.296532


 13660/50000: episode: 106, duration: 3.353s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.192 [-1.363, 1.285], loss: 7.847457, mean_absolute_error: 38.887848, mean_q: 78.365265


 13860/50000: episode: 107, duration: 3.321s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.530 [0.000, 1.000], mean observation: 0.234 [-1.024, 2.226], loss: 6.276882, mean_absolute_error: 38.970905, mean_q: 78.581429


 14060/50000: episode: 108, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.157 [-1.120, 1.512], loss: 10.426866, mean_absolute_error: 39.248528, mean_q: 78.792519


 14260/50000: episode: 109, duration: 3.340s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.017 [-1.075, 1.204], loss: 4.597863, mean_absolute_error: 39.707062, mean_q: 80.037041


 14460/50000: episode: 110, duration: 3.320s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.097 [-1.187, 1.353], loss: 8.820493, mean_absolute_error: 39.756004, mean_q: 79.995842


 14660/50000: episode: 111, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.229 [-1.511, 0.837], loss: 8.013705, mean_absolute_error: 39.721287, mean_q: 79.952263


 14860/50000: episode: 112, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.025 [-1.235, 1.139], loss: 7.504385, mean_absolute_error: 40.272072, mean_q: 81.057182


 15060/50000: episode: 113, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.203 [-1.158, 1.851], loss: 9.308469, mean_absolute_error: 40.483475, mean_q: 81.314713


 15260/50000: episode: 114, duration: 3.342s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.225 [-1.442, 1.281], loss: 9.730762, mean_absolute_error: 40.330994, mean_q: 81.093567


 15460/50000: episode: 115, duration: 3.333s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.112 [-1.173, 1.094], loss: 7.054383, mean_absolute_error: 40.573460, mean_q: 81.761574


 15611/50000: episode: 116, duration: 2.513s, episode steps: 151, steps per second: 60, episode reward: 151.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: 0.394 [-1.305, 3.111], loss: 9.489411, mean_absolute_error: 40.474979, mean_q: 81.426765


 15811/50000: episode: 117, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.328 [-2.136, 1.011], loss: 5.290573, mean_absolute_error: 40.571583, mean_q: 81.732880


 16011/50000: episode: 118, duration: 3.380s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.318 [-2.000, 1.217], loss: 6.500682, mean_absolute_error: 40.773876, mean_q: 82.178360


 16211/50000: episode: 119, duration: 3.328s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.329 [-2.097, 1.088], loss: 6.972532, mean_absolute_error: 40.772411, mean_q: 82.064514


 16411/50000: episode: 120, duration: 3.333s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.097 [-1.310, 1.453], loss: 6.344914, mean_absolute_error: 40.498985, mean_q: 81.563019


 16611/50000: episode: 121, duration: 3.326s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.204 [-1.341, 1.377], loss: 7.196208, mean_absolute_error: 41.012829, mean_q: 82.599808


 16783/50000: episode: 122, duration: 2.865s, episode steps: 172, steps per second: 60, episode reward: 172.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.541 [0.000, 1.000], mean observation: 0.341 [-1.335, 3.139], loss: 5.073210, mean_absolute_error: 40.860943, mean_q: 82.365211


 16983/50000: episode: 123, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.228 [-1.497, 1.366], loss: 6.609347, mean_absolute_error: 41.007252, mean_q: 82.585785


 17183/50000: episode: 124, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.231 [-1.475, 0.927], loss: 14.076356, mean_absolute_error: 41.060516, mean_q: 82.359314


 17383/50000: episode: 125, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.208 [-1.345, 1.162], loss: 5.381383, mean_absolute_error: 40.755219, mean_q: 82.096893


 17579/50000: episode: 126, duration: 3.263s, episode steps: 196, steps per second: 60, episode reward: 196.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.358 [-2.404, 1.048], loss: 8.017097, mean_absolute_error: 41.036213, mean_q: 82.615166


 17779/50000: episode: 127, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.188 [-1.278, 0.999], loss: 4.470975, mean_absolute_error: 41.186226, mean_q: 83.028084


 17979/50000: episode: 128, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.225 [-1.469, 1.359], loss: 8.047194, mean_absolute_error: 41.106480, mean_q: 82.702209


 18166/50000: episode: 129, duration: 3.114s, episode steps: 187, steps per second: 60, episode reward: 187.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.330 [-1.193, 2.557], loss: 7.185256, mean_absolute_error: 41.101608, mean_q: 82.574135


 18351/50000: episode: 130, duration: 3.081s, episode steps: 185, steps per second: 60, episode reward: 185.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.535 [0.000, 1.000], mean observation: 0.340 [-0.888, 2.550], loss: 4.900700, mean_absolute_error: 40.984127, mean_q: 82.438812


 18551/50000: episode: 131, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.167 [-1.362, 1.561], loss: 5.964108, mean_absolute_error: 41.163506, mean_q: 82.772285


 18690/50000: episode: 132, duration: 2.313s, episode steps: 139, steps per second: 60, episode reward: 139.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.540 [0.000, 1.000], mean observation: 0.439 [-1.283, 2.443], loss: 4.803247, mean_absolute_error: 41.144333, mean_q: 82.833450


 18890/50000: episode: 133, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.530 [0.000, 1.000], mean observation: 0.247 [-1.228, 2.415], loss: 7.458500, mean_absolute_error: 41.271187, mean_q: 83.005341


 19090/50000: episode: 134, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.061 [-1.123, 0.996], loss: 10.219311, mean_absolute_error: 41.098221, mean_q: 82.582970


 19290/50000: episode: 135, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.069 [-1.527, 1.443], loss: 10.723494, mean_absolute_error: 41.233009, mean_q: 82.769157


 19490/50000: episode: 136, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.092 [-1.146, 1.029], loss: 9.765299, mean_absolute_error: 41.176708, mean_q: 82.723595


 19690/50000: episode: 137, duration: 3.395s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.125 [-1.148, 1.306], loss: 8.624767, mean_absolute_error: 41.131718, mean_q: 82.661964


 19890/50000: episode: 138, duration: 3.347s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.182 [-1.128, 1.301], loss: 7.723626, mean_absolute_error: 41.074146, mean_q: 82.595528


 20090/50000: episode: 139, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.171 [-1.101, 0.881], loss: 6.143629, mean_absolute_error: 41.138439, mean_q: 82.925880


 20290/50000: episode: 140, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.240 [-1.371, 1.141], loss: 8.712115, mean_absolute_error: 41.479149, mean_q: 83.397217


 20490/50000: episode: 141, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.118 [-1.238, 1.016], loss: 6.981825, mean_absolute_error: 40.908009, mean_q: 82.348274


 20690/50000: episode: 142, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.145 [-1.073, 1.017], loss: 7.278175, mean_absolute_error: 41.297909, mean_q: 83.037560


 20828/50000: episode: 143, duration: 2.296s, episode steps: 138, steps per second: 60, episode reward: 138.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.543 [0.000, 1.000], mean observation: 0.422 [-1.192, 2.829], loss: 4.094743, mean_absolute_error: 41.511478, mean_q: 83.590691


 21028/50000: episode: 144, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.146 [-1.171, 1.287], loss: 7.193943, mean_absolute_error: 41.496914, mean_q: 83.412521


 21228/50000: episode: 145, duration: 3.336s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.076 [-1.369, 1.090], loss: 6.105279, mean_absolute_error: 41.526543, mean_q: 83.494720


 21428/50000: episode: 146, duration: 3.323s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.097 [-1.172, 1.137], loss: 5.304854, mean_absolute_error: 41.384396, mean_q: 83.244911


 21628/50000: episode: 147, duration: 3.332s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.109 [-1.287, 1.165], loss: 10.126225, mean_absolute_error: 41.570267, mean_q: 83.320770


 21777/50000: episode: 148, duration: 2.479s, episode steps: 149, steps per second: 60, episode reward: 149.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.537 [0.000, 1.000], mean observation: 0.351 [-1.323, 2.232], loss: 7.317624, mean_absolute_error: 41.260113, mean_q: 82.916786


 21977/50000: episode: 149, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.075 [-1.183, 1.338], loss: 6.789692, mean_absolute_error: 41.370644, mean_q: 83.104256


 22177/50000: episode: 150, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.101 [-1.323, 1.394], loss: 7.065582, mean_absolute_error: 41.639538, mean_q: 83.669464


 22317/50000: episode: 151, duration: 2.331s, episode steps: 140, steps per second: 60, episode reward: 140.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: 0.367 [-1.646, 2.610], loss: 8.286376, mean_absolute_error: 41.787033, mean_q: 83.864922


 22517/50000: episode: 152, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.047 [-1.318, 1.151], loss: 5.841610, mean_absolute_error: 41.539505, mean_q: 83.414246


 22717/50000: episode: 153, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.141 [-1.381, 1.487], loss: 6.795452, mean_absolute_error: 41.471523, mean_q: 83.324745


 22892/50000: episode: 154, duration: 2.914s, episode steps: 175, steps per second: 60, episode reward: 175.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: 0.348 [-1.199, 2.542], loss: 7.732283, mean_absolute_error: 41.462852, mean_q: 83.260826


 23058/50000: episode: 155, duration: 2.764s, episode steps: 166, steps per second: 60, episode reward: 166.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: 0.346 [-1.391, 2.409], loss: 7.521837, mean_absolute_error: 41.439873, mean_q: 83.092995


 23200/50000: episode: 156, duration: 2.364s, episode steps: 142, steps per second: 60, episode reward: 142.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.479 [0.000, 1.000], mean observation: -0.320 [-1.612, 1.034], loss: 8.356171, mean_absolute_error: 41.340145, mean_q: 82.848465


 23400/50000: episode: 157, duration: 3.367s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.305 [-2.087, 0.957], loss: 9.160895, mean_absolute_error: 40.944290, mean_q: 82.054642


 23549/50000: episode: 158, duration: 2.476s, episode steps: 149, steps per second: 60, episode reward: 149.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.342 [-1.821, 1.135], loss: 9.354344, mean_absolute_error: 41.059994, mean_q: 82.288261


 23749/50000: episode: 159, duration: 3.333s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.143 [-1.270, 1.274], loss: 10.909944, mean_absolute_error: 40.744701, mean_q: 81.655411


 23949/50000: episode: 160, duration: 3.341s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.154 [-1.204, 1.197], loss: 8.927660, mean_absolute_error: 40.066788, mean_q: 80.310974


 24149/50000: episode: 161, duration: 3.380s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.100 [-1.174, 1.080], loss: 8.164768, mean_absolute_error: 40.650330, mean_q: 81.603416


 24315/50000: episode: 162, duration: 2.780s, episode steps: 166, steps per second: 60, episode reward: 166.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.530 [0.000, 1.000], mean observation: 0.330 [-1.441, 2.451], loss: 7.205022, mean_absolute_error: 40.330662, mean_q: 81.020668


 24456/50000: episode: 163, duration: 2.352s, episode steps: 141, steps per second: 60, episode reward: 141.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.344 [-1.539, 2.573], loss: 4.887612, mean_absolute_error: 40.313606, mean_q: 81.019043


 24606/50000: episode: 164, duration: 2.492s, episode steps: 150, steps per second: 60, episode reward: 150.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.527 [0.000, 1.000], mean observation: 0.328 [-1.427, 2.634], loss: 3.034650, mean_absolute_error: 39.923351, mean_q: 80.331139


 24806/50000: episode: 165, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.053 [-1.573, 1.319], loss: 5.971332, mean_absolute_error: 40.267365, mean_q: 80.946732


 24950/50000: episode: 166, duration: 2.414s, episode steps: 144, steps per second: 60, episode reward: 144.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.362 [-1.616, 2.753], loss: 6.259516, mean_absolute_error: 40.125290, mean_q: 80.523575


 25135/50000: episode: 167, duration: 3.099s, episode steps: 185, steps per second: 60, episode reward: 185.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: 0.317 [-1.681, 2.997], loss: 3.957797, mean_absolute_error: 40.353313, mean_q: 81.183220


 25335/50000: episode: 168, duration: 3.327s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.104 [-1.600, 1.428], loss: 3.475776, mean_absolute_error: 40.260189, mean_q: 80.974953


 25522/50000: episode: 169, duration: 3.147s, episode steps: 187, steps per second: 59, episode reward: 187.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.190 [-1.456, 2.043], loss: 6.564630, mean_absolute_error: 40.408291, mean_q: 81.111267


 25690/50000: episode: 170, duration: 2.797s, episode steps: 168, steps per second: 60, episode reward: 168.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: 0.215 [-1.362, 2.082], loss: 5.851522, mean_absolute_error: 40.019310, mean_q: 80.334091


 25841/50000: episode: 171, duration: 2.514s, episode steps: 151, steps per second: 60, episode reward: 151.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.255 [-1.590, 2.399], loss: 4.172309, mean_absolute_error: 40.271282, mean_q: 80.785423


 25974/50000: episode: 172, duration: 2.214s, episode steps: 133, steps per second: 60, episode reward: 133.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.287 [-1.779, 2.483], loss: 8.372341, mean_absolute_error: 40.172867, mean_q: 80.483032


 26174/50000: episode: 173, duration: 3.343s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.188 [-1.435, 1.889], loss: 6.245988, mean_absolute_error: 39.950802, mean_q: 80.152985


 26341/50000: episode: 174, duration: 2.783s, episode steps: 167, steps per second: 60, episode reward: 167.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.198 [-1.439, 2.015], loss: 4.477818, mean_absolute_error: 39.508732, mean_q: 79.418434


 26539/50000: episode: 175, duration: 3.295s, episode steps: 198, steps per second: 60, episode reward: 198.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.174 [-1.706, 2.285], loss: 7.822410, mean_absolute_error: 39.955921, mean_q: 80.110657


 26739/50000: episode: 176, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.030 [-1.329, 1.265], loss: 5.459524, mean_absolute_error: 39.323433, mean_q: 78.931229


 26864/50000: episode: 177, duration: 2.082s, episode steps: 125, steps per second: 60, episode reward: 125.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.253 [-1.710, 2.277], loss: 4.711681, mean_absolute_error: 40.050774, mean_q: 80.357826


 27004/50000: episode: 178, duration: 2.329s, episode steps: 140, steps per second: 60, episode reward: 140.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.507 [0.000, 1.000], mean observation: 0.205 [-1.841, 2.270], loss: 6.388546, mean_absolute_error: 39.643883, mean_q: 79.556923


 27162/50000: episode: 179, duration: 2.632s, episode steps: 158, steps per second: 60, episode reward: 158.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.506 [0.000, 1.000], mean observation: 0.267 [-2.133, 2.821], loss: 4.809639, mean_absolute_error: 39.862347, mean_q: 80.037842


 27362/50000: episode: 180, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.113 [-1.918, 2.464], loss: 5.810243, mean_absolute_error: 39.456051, mean_q: 79.254044


 27562/50000: episode: 181, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.115 [-1.561, 2.097], loss: 5.218877, mean_absolute_error: 39.474342, mean_q: 79.173302


 27727/50000: episode: 182, duration: 2.747s, episode steps: 165, steps per second: 60, episode reward: 165.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.182 [-1.405, 2.028], loss: 3.418368, mean_absolute_error: 39.782906, mean_q: 79.962280


 27927/50000: episode: 183, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.130 [-1.616, 2.084], loss: 3.623908, mean_absolute_error: 39.543060, mean_q: 79.444054


 28127/50000: episode: 184, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.055 [-1.505, 1.358], loss: 4.640255, mean_absolute_error: 39.059113, mean_q: 78.449699


 28257/50000: episode: 185, duration: 2.164s, episode steps: 130, steps per second: 60, episode reward: 130.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.246 [-1.688, 2.276], loss: 7.131168, mean_absolute_error: 39.714821, mean_q: 79.578278


 28447/50000: episode: 186, duration: 3.175s, episode steps: 190, steps per second: 60, episode reward: 190.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.180 [-1.589, 2.435], loss: 4.160255, mean_absolute_error: 39.297199, mean_q: 78.906433


 28611/50000: episode: 187, duration: 2.732s, episode steps: 164, steps per second: 60, episode reward: 164.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.506 [0.000, 1.000], mean observation: 0.155 [-1.356, 2.159], loss: 4.761941, mean_absolute_error: 39.401691, mean_q: 79.125282


 28756/50000: episode: 188, duration: 2.414s, episode steps: 145, steps per second: 60, episode reward: 145.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: 0.199 [-1.546, 1.846], loss: 7.005396, mean_absolute_error: 39.515121, mean_q: 79.293228


 28863/50000: episode: 189, duration: 1.781s, episode steps: 107, steps per second: 60, episode reward: 107.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.202 [-1.402, 1.682], loss: 4.202790, mean_absolute_error: 38.984013, mean_q: 78.327255


 29019/50000: episode: 190, duration: 2.598s, episode steps: 156, steps per second: 60, episode reward: 156.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.186 [-1.058, 1.525], loss: 1.878162, mean_absolute_error: 39.158073, mean_q: 78.765747


 29215/50000: episode: 191, duration: 3.263s, episode steps: 196, steps per second: 60, episode reward: 196.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.123 [-1.809, 2.286], loss: 7.341709, mean_absolute_error: 39.426781, mean_q: 78.953735


 29383/50000: episode: 192, duration: 2.810s, episode steps: 168, steps per second: 60, episode reward: 168.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.151 [-1.246, 1.703], loss: 4.559742, mean_absolute_error: 39.076740, mean_q: 78.372131


 29548/50000: episode: 193, duration: 2.748s, episode steps: 165, steps per second: 60, episode reward: 165.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.159 [-1.415, 1.900], loss: 4.491376, mean_absolute_error: 39.194859, mean_q: 78.606346


 29748/50000: episode: 194, duration: 3.381s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.072 [-1.691, 2.053], loss: 8.914982, mean_absolute_error: 39.336926, mean_q: 78.720749


 29948/50000: episode: 195, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.250 [-1.713, 1.943], loss: 2.839404, mean_absolute_error: 38.813293, mean_q: 77.950363


 30148/50000: episode: 196, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.277 [-1.564, 1.755], loss: 3.677955, mean_absolute_error: 38.797733, mean_q: 77.899490


 30348/50000: episode: 197, duration: 3.346s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.530 [0.000, 1.000], mean observation: 0.285 [-1.273, 2.169], loss: 4.568062, mean_absolute_error: 38.958473, mean_q: 78.235504


 30548/50000: episode: 198, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.243 [-1.512, 2.042], loss: 6.840321, mean_absolute_error: 38.661770, mean_q: 77.513504


 30748/50000: episode: 199, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.297 [-1.397, 1.928], loss: 5.704790, mean_absolute_error: 38.662525, mean_q: 77.550354


 30909/50000: episode: 200, duration: 2.681s, episode steps: 161, steps per second: 60, episode reward: 161.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.345 [-1.644, 2.028], loss: 6.890038, mean_absolute_error: 38.655964, mean_q: 77.617867


 31067/50000: episode: 201, duration: 2.631s, episode steps: 158, steps per second: 60, episode reward: 158.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.367 [-1.677, 2.469], loss: 5.182693, mean_absolute_error: 37.923199, mean_q: 76.077232


 31230/50000: episode: 202, duration: 2.713s, episode steps: 163, steps per second: 60, episode reward: 163.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.323 [-1.230, 1.848], loss: 7.482160, mean_absolute_error: 38.518490, mean_q: 77.107208


 31406/50000: episode: 203, duration: 2.930s, episode steps: 176, steps per second: 60, episode reward: 176.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.359 [-1.276, 2.344], loss: 6.597456, mean_absolute_error: 38.138615, mean_q: 76.467903


 31524/50000: episode: 204, duration: 1.964s, episode steps: 118, steps per second: 60, episode reward: 118.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.179 [-1.506, 1.862], loss: 4.216923, mean_absolute_error: 38.325310, mean_q: 76.982994


 31692/50000: episode: 205, duration: 2.797s, episode steps: 168, steps per second: 60, episode reward: 168.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.328 [-1.529, 2.412], loss: 6.642649, mean_absolute_error: 37.790771, mean_q: 75.663177


 31857/50000: episode: 206, duration: 2.747s, episode steps: 165, steps per second: 60, episode reward: 165.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.409 [-1.702, 2.362], loss: 2.959098, mean_absolute_error: 37.739086, mean_q: 75.844421


 32001/50000: episode: 207, duration: 2.397s, episode steps: 144, steps per second: 60, episode reward: 144.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.147 [-1.467, 1.682], loss: 2.823395, mean_absolute_error: 38.069424, mean_q: 76.501831


 32164/50000: episode: 208, duration: 2.713s, episode steps: 163, steps per second: 60, episode reward: 163.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.437 [-1.650, 2.318], loss: 5.812769, mean_absolute_error: 37.693268, mean_q: 75.538322


 32314/50000: episode: 209, duration: 2.507s, episode steps: 150, steps per second: 60, episode reward: 150.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.446 [-1.599, 2.302], loss: 3.543967, mean_absolute_error: 37.501415, mean_q: 75.319054


 32441/50000: episode: 210, duration: 2.104s, episode steps: 127, steps per second: 60, episode reward: 127.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.141 [-1.519, 1.658], loss: 1.577616, mean_absolute_error: 38.007172, mean_q: 76.384537


 32641/50000: episode: 211, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.038 [-1.250, 1.044], loss: 4.267598, mean_absolute_error: 38.103409, mean_q: 76.531013


 32841/50000: episode: 212, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.014 [-1.712, 1.706], loss: 4.748281, mean_absolute_error: 37.878101, mean_q: 75.906693


 33041/50000: episode: 213, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.004 [-1.597, 1.735], loss: 3.646704, mean_absolute_error: 38.330956, mean_q: 76.963310


 33241/50000: episode: 214, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.261 [-2.053, 2.471], loss: 6.328341, mean_absolute_error: 38.133556, mean_q: 76.350716


 33441/50000: episode: 215, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.301 [-1.575, 1.962], loss: 5.984219, mean_absolute_error: 37.801689, mean_q: 75.611092


 33641/50000: episode: 216, duration: 3.342s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.152 [-1.578, 2.064], loss: 2.624962, mean_absolute_error: 37.926952, mean_q: 76.158539


 33841/50000: episode: 217, duration: 3.316s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.185 [-2.146, 2.077], loss: 5.490245, mean_absolute_error: 37.844723, mean_q: 75.810638


 34004/50000: episode: 218, duration: 2.714s, episode steps: 163, steps per second: 60, episode reward: 163.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.172 [-1.775, 2.217], loss: 7.692401, mean_absolute_error: 37.677902, mean_q: 75.575516


 34204/50000: episode: 219, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.181 [-1.409, 1.715], loss: 4.411875, mean_absolute_error: 37.731354, mean_q: 75.877098


 34362/50000: episode: 220, duration: 2.631s, episode steps: 158, steps per second: 60, episode reward: 158.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.111 [-2.081, 2.100], loss: 4.006960, mean_absolute_error: 37.869457, mean_q: 76.008308


 34562/50000: episode: 221, duration: 3.379s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.033 [-1.948, 1.769], loss: 4.233716, mean_absolute_error: 37.827518, mean_q: 75.929916


 34762/50000: episode: 222, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.004 [-1.976, 2.035], loss: 4.881142, mean_absolute_error: 38.054111, mean_q: 76.310860


 34962/50000: episode: 223, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.201 [-2.193, 2.324], loss: 6.984152, mean_absolute_error: 38.082584, mean_q: 76.331276


 35058/50000: episode: 224, duration: 1.615s, episode steps: 96, steps per second: 59, episode reward: 96.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.138 [-1.898, 2.106], loss: 5.456718, mean_absolute_error: 38.401917, mean_q: 77.061478


 35258/50000: episode: 225, duration: 3.326s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.003 [-2.019, 1.659], loss: 5.502890, mean_absolute_error: 38.136139, mean_q: 76.633202


 35458/50000: episode: 226, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.154 [-1.792, 1.900], loss: 6.545844, mean_absolute_error: 38.491299, mean_q: 77.401863


 35658/50000: episode: 227, duration: 3.328s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.089 [-1.931, 1.930], loss: 6.485698, mean_absolute_error: 38.220795, mean_q: 76.813515


 35858/50000: episode: 228, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.184 [-2.074, 2.262], loss: 9.095310, mean_absolute_error: 38.671593, mean_q: 77.516907


 36058/50000: episode: 229, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.022 [-1.629, 1.514], loss: 9.309761, mean_absolute_error: 38.125278, mean_q: 76.431732


 36258/50000: episode: 230, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.034 [-1.805, 1.497], loss: 9.197002, mean_absolute_error: 38.583595, mean_q: 77.319099


 36458/50000: episode: 231, duration: 3.335s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.071 [-1.889, 1.853], loss: 8.875377, mean_absolute_error: 38.637951, mean_q: 77.580521


 36630/50000: episode: 232, duration: 2.891s, episode steps: 172, steps per second: 59, episode reward: 172.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: 0.068 [-1.725, 1.746], loss: 7.023501, mean_absolute_error: 38.627701, mean_q: 77.606255


 36830/50000: episode: 233, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.031 [-2.101, 2.053], loss: 9.897669, mean_absolute_error: 38.997581, mean_q: 78.194603


 37030/50000: episode: 234, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.013 [-0.988, 0.988], loss: 7.907106, mean_absolute_error: 38.623344, mean_q: 77.715363


 37230/50000: episode: 235, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.015 [-1.145, 1.586], loss: 7.794847, mean_absolute_error: 39.019859, mean_q: 78.548996


 37430/50000: episode: 236, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.013 [-1.213, 1.014], loss: 6.552354, mean_absolute_error: 39.153698, mean_q: 78.843605


 37630/50000: episode: 237, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.001 [-1.570, 1.435], loss: 6.758049, mean_absolute_error: 39.328697, mean_q: 79.184807


 37830/50000: episode: 238, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.005 [-1.397, 1.462], loss: 7.174676, mean_absolute_error: 39.489513, mean_q: 79.361145


 38030/50000: episode: 239, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.030 [-1.475, 1.306], loss: 9.937299, mean_absolute_error: 39.580002, mean_q: 79.466309


 38230/50000: episode: 240, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.015 [-1.267, 1.382], loss: 8.120698, mean_absolute_error: 39.227917, mean_q: 78.673607


 38430/50000: episode: 241, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.033 [-1.959, 1.537], loss: 6.980706, mean_absolute_error: 39.871456, mean_q: 80.092995


 38554/50000: episode: 242, duration: 2.082s, episode steps: 124, steps per second: 60, episode reward: 124.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.026 [-1.999, 1.804], loss: 9.760142, mean_absolute_error: 39.760902, mean_q: 79.676201


 38754/50000: episode: 243, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.115 [-1.724, 1.756], loss: 7.712204, mean_absolute_error: 39.763432, mean_q: 79.744087


 38892/50000: episode: 244, duration: 2.296s, episode steps: 138, steps per second: 60, episode reward: 138.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.022 [-1.403, 1.346], loss: 6.143910, mean_absolute_error: 39.758320, mean_q: 79.753746


 39037/50000: episode: 245, duration: 2.414s, episode steps: 145, steps per second: 60, episode reward: 145.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.503 [0.000, 1.000], mean observation: 0.085 [-1.807, 1.850], loss: 6.808954, mean_absolute_error: 39.811577, mean_q: 79.862984


 39144/50000: episode: 246, duration: 1.782s, episode steps: 107, steps per second: 60, episode reward: 107.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.048 [-1.400, 1.930], loss: 15.499659, mean_absolute_error: 40.021317, mean_q: 79.885040


 39344/50000: episode: 247, duration: 3.350s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.028 [-1.750, 1.619], loss: 7.391844, mean_absolute_error: 39.794327, mean_q: 79.815887


 39518/50000: episode: 248, duration: 2.892s, episode steps: 174, steps per second: 60, episode reward: 174.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.017 [-1.769, 2.008], loss: 8.387716, mean_absolute_error: 39.781498, mean_q: 79.788971


 39718/50000: episode: 249, duration: 3.348s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.015 [-1.769, 2.078], loss: 9.452872, mean_absolute_error: 39.963799, mean_q: 80.241676


 39918/50000: episode: 250, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.031 [-1.842, 1.559], loss: 6.481316, mean_absolute_error: 40.136406, mean_q: 80.692398


 40118/50000: episode: 251, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.008 [-1.128, 1.291], loss: 7.358675, mean_absolute_error: 40.384796, mean_q: 81.088615


 40318/50000: episode: 252, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.040 [-1.734, 2.007], loss: 9.038155, mean_absolute_error: 40.240730, mean_q: 80.994377


 40518/50000: episode: 253, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.007 [-1.792, 1.767], loss: 7.005382, mean_absolute_error: 40.250507, mean_q: 81.021202


 40718/50000: episode: 254, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.035 [-1.831, 1.448], loss: 6.675478, mean_absolute_error: 40.619652, mean_q: 81.617416


 40918/50000: episode: 255, duration: 3.342s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.030 [-2.110, 1.857], loss: 11.094107, mean_absolute_error: 40.472954, mean_q: 81.058601


 41118/50000: episode: 256, duration: 3.333s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.005 [-1.593, 1.525], loss: 5.173509, mean_absolute_error: 40.614323, mean_q: 81.795280


 41318/50000: episode: 257, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.010 [-1.786, 1.692], loss: 9.145093, mean_absolute_error: 41.069775, mean_q: 82.517311


 41518/50000: episode: 258, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.042 [-1.866, 1.915], loss: 12.196655, mean_absolute_error: 40.936127, mean_q: 81.857689


 41718/50000: episode: 259, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.017 [-1.818, 1.445], loss: 11.258022, mean_absolute_error: 41.064049, mean_q: 82.215202


 41918/50000: episode: 260, duration: 3.350s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.041 [-1.700, 1.567], loss: 10.443887, mean_absolute_error: 40.815121, mean_q: 81.905594


 42118/50000: episode: 261, duration: 3.326s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.106 [-1.773, 1.529], loss: 10.006483, mean_absolute_error: 41.151291, mean_q: 82.496185


 42318/50000: episode: 262, duration: 3.363s, episode steps: 200, steps per second: 59, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.001 [-2.060, 1.703], loss: 10.812065, mean_absolute_error: 40.964619, mean_q: 82.173134


 42518/50000: episode: 263, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.015 [-1.263, 1.595], loss: 6.146927, mean_absolute_error: 41.071877, mean_q: 82.490059


 42718/50000: episode: 264, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.002 [-2.043, 2.152], loss: 11.599335, mean_absolute_error: 41.477852, mean_q: 83.024170


 42864/50000: episode: 265, duration: 2.430s, episode steps: 146, steps per second: 60, episode reward: 146.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.037 [-1.801, 1.749], loss: 11.038758, mean_absolute_error: 41.235500, mean_q: 82.732056


 42970/50000: episode: 266, duration: 1.764s, episode steps: 106, steps per second: 60, episode reward: 106.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.084 [-1.801, 1.721], loss: 8.810161, mean_absolute_error: 41.264477, mean_q: 82.863731


 43170/50000: episode: 267, duration: 3.335s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.013 [-1.985, 1.741], loss: 8.529746, mean_absolute_error: 41.295975, mean_q: 83.060570


 43370/50000: episode: 268, duration: 3.325s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.059 [-1.374, 1.680], loss: 6.829450, mean_absolute_error: 41.288857, mean_q: 83.145691


 43570/50000: episode: 269, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.140 [-1.315, 1.483], loss: 14.877798, mean_absolute_error: 41.511288, mean_q: 83.291901


 43770/50000: episode: 270, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.014 [-1.086, 1.283], loss: 11.914121, mean_absolute_error: 41.645176, mean_q: 83.693916


 43970/50000: episode: 271, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.005 [-1.424, 1.150], loss: 9.789321, mean_absolute_error: 41.452797, mean_q: 83.301857


 44170/50000: episode: 272, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.002 [-1.673, 1.577], loss: 10.794568, mean_absolute_error: 41.739220, mean_q: 83.721832


 44370/50000: episode: 273, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.008 [-1.348, 1.436], loss: 9.255638, mean_absolute_error: 41.917301, mean_q: 84.171173


 44570/50000: episode: 274, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.007 [-1.465, 1.611], loss: 8.055296, mean_absolute_error: 41.813999, mean_q: 83.950256


 44770/50000: episode: 275, duration: 3.347s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.004 [-1.498, 1.472], loss: 12.039726, mean_absolute_error: 41.932003, mean_q: 84.021805


 44970/50000: episode: 276, duration: 3.509s, episode steps: 200, steps per second: 57, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.006 [-1.408, 1.622], loss: 10.883841, mean_absolute_error: 42.169952, mean_q: 84.530548


 45170/50000: episode: 277, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.002 [-1.309, 1.267], loss: 9.506698, mean_absolute_error: 41.821323, mean_q: 84.023811


 45370/50000: episode: 278, duration: 3.346s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.002 [-0.913, 0.990], loss: 9.051252, mean_absolute_error: 42.218781, mean_q: 84.838943


 45570/50000: episode: 279, duration: 3.341s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.002 [-1.828, 1.766], loss: 10.202212, mean_absolute_error: 42.146397, mean_q: 84.754631


 45770/50000: episode: 280, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.008 [-1.327, 1.531], loss: 12.226645, mean_absolute_error: 42.007118, mean_q: 84.367828


 45970/50000: episode: 281, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.010 [-1.734, 1.398], loss: 7.531800, mean_absolute_error: 42.154247, mean_q: 84.856789


 46170/50000: episode: 282, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.004 [-1.361, 1.548], loss: 13.493037, mean_absolute_error: 42.236073, mean_q: 84.664131


 46370/50000: episode: 283, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.004 [-1.181, 1.011], loss: 13.870362, mean_absolute_error: 42.079357, mean_q: 84.577774


 46570/50000: episode: 284, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.004 [-1.522, 1.512], loss: 10.039142, mean_absolute_error: 41.848923, mean_q: 84.137543


 46770/50000: episode: 285, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.002 [-1.529, 1.649], loss: 7.827116, mean_absolute_error: 42.055767, mean_q: 84.680000


 46970/50000: episode: 286, duration: 3.328s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.007 [-1.985, 1.697], loss: 12.918654, mean_absolute_error: 42.347076, mean_q: 84.727921


 47170/50000: episode: 287, duration: 3.332s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.009 [-1.556, 1.682], loss: 11.581987, mean_absolute_error: 42.114235, mean_q: 84.585663


 47370/50000: episode: 288, duration: 3.328s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.035 [-1.819, 1.781], loss: 6.345186, mean_absolute_error: 42.138222, mean_q: 84.910042


 47570/50000: episode: 289, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.009 [-1.408, 1.573], loss: 8.546060, mean_absolute_error: 42.265816, mean_q: 85.116798


 47770/50000: episode: 290, duration: 3.343s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.010 [-1.275, 1.297], loss: 7.675524, mean_absolute_error: 42.420147, mean_q: 85.441452


 47970/50000: episode: 291, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.014 [-1.306, 1.673], loss: 7.769229, mean_absolute_error: 42.559959, mean_q: 85.637321


 48170/50000: episode: 292, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.006 [-1.838, 1.884], loss: 10.006059, mean_absolute_error: 42.565319, mean_q: 85.588768


 48370/50000: episode: 293, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.011 [-1.920, 1.792], loss: 10.684458, mean_absolute_error: 43.017578, mean_q: 86.379745


 48570/50000: episode: 294, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.008 [-1.379, 1.434], loss: 18.169628, mean_absolute_error: 43.014896, mean_q: 86.142784


 48770/50000: episode: 295, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.003 [-1.771, 1.909], loss: 7.223823, mean_absolute_error: 42.662354, mean_q: 85.752258


 48970/50000: episode: 296, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.001 [-1.363, 1.713], loss: 7.973239, mean_absolute_error: 42.733906, mean_q: 86.008553


 49170/50000: episode: 297, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.001 [-1.504, 1.464], loss: 13.685594, mean_absolute_error: 42.802261, mean_q: 85.908844


 49370/50000: episode: 298, duration: 3.330s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.019 [-1.480, 1.475], loss: 15.732663, mean_absolute_error: 43.034836, mean_q: 86.344002


 49570/50000: episode: 299, duration: 3.329s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.015 [-1.302, 1.442], loss: 7.218863, mean_absolute_error: 42.938053, mean_q: 86.451942


 49770/50000: episode: 300, duration: 3.343s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.095 [-1.456, 1.573], loss: 12.734009, mean_absolute_error: 42.809364, mean_q: 86.152306


 49970/50000: episode: 301, duration: 3.331s, episode steps: 200, steps per second: 60, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.044 [-1.760, 1.650], loss: 13.485715, mean_absolute_error: 43.348839, mean_q: 86.988586


done, took 835.635 seconds


<keras.callbacks.History at 0x14838973a58>

In [14]:
# Write the agent's current weights to 'dqn_<ENV_NAME>_weights.h5f' (a path relative
# to the working directory). overwrite=True is passed explicitly; presumably this
# replaces an existing file of the same name without prompting — confirm in keras-rl.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

In [7]:
# Evaluate the agent for 5 episodes with on-screen rendering.
#
# The recorded run of this cell aborted after episode 1 with
# "AttributeError: 'NoneType' object has no attribute 'flip'", i.e. the render
# window was gone while the test was still drawing (presumably it was closed by
# hand — TODO confirm). Whatever the cause, the viewer must not be left behind in
# a half-dead state, so it is torn down in `finally` on success, error and
# keyboard interrupt alike.
try:
    dqn.test(env, nb_episodes=5, visualize=True)
finally:
    # Close only the viewer, the same way the manual rendering cells in this
    # notebook do, instead of calling env.close() on the shared environment.
    env.render(close=True)

Testing for 5 episodes ...


Episode 1: reward: 200.000, steps: 200


AttributeError: 'NoneType' object has no attribute 'flip'

In [6]:
# Load agent weights from 'dqn_<ENV_NAME>_weights.h5f' — the same file name pattern
# that the save_weights cell writes to.
dqn.load_weights('dqn_{}_weights.h5f'.format(ENV_NAME))

In [98]:
# Start a new episode. The value returned by reset() (the initial observation) is
# the cell's last expression, so it is displayed as the cell output.
env.reset()

array([-0.00417311, -0.03291649,  0.04090877,  0.01345695])

In [79]:
# Take one randomly sampled action and print the resulting transition.
action = env.action_space.sample()
# step() returns a 4-tuple, unpacked here as (observation, reward, done, info).
observation, reward, done, info = env.step(action)
print(action, observation, reward, done, info)

0 [ 0.03676252 -0.20705747  0.04376185  0.31327263] 1.0 False {}


In [97]:
# Watch 20 random steps, one per second, printing every transition.
#
# Two safeguards compared with a bare loop:
#   * when step() reports done=True the episode is restarted, because gym treats
#     further step() calls on a finished episode as undefined behaviour (random
#     play can end well within 20 steps — the random episode recorded later in
#     this notebook finished after 15);
#   * the viewer is closed in `finally`, so the window is released even if the
#     cell is interrupted or a step/render call raises.
# The loop continues the episode that is currently in progress on `env`; it does
# not reset the environment up front.
try:
    for i in range(20):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        env.render()
        print(action, observation, reward, done, info)
        time.sleep(1)  # slow the animation down enough to follow by eye
        if done:
            # Begin a fresh episode so the next step() call is well defined.
            observation = env.reset()
finally:
    env.render(close=True)

1 [ 0.01944849  0.22771966  0.01550582 -0.32227389] 1.0 False {}


1 [ 0.02400288  0.42261741  0.00906034 -0.61002684] 1.0 False {}


0 [ 0.03245523  0.22736999 -0.00314019 -0.31450403] 1.0 False {}


0 [ 0.03700263  0.03229291 -0.00943027 -0.02281307] 1.0 False {}


0 [ 0.03764849 -0.16269254 -0.00988654  0.26687964] 1.0 False {}


0 [ 0.03439464 -0.357672   -0.00454894  0.55642795] 1.0 False {}


1 [ 0.0272412  -0.16248648  0.00657962  0.26231532] 1.0 False {}


0 [ 0.02399147 -0.35770173  0.01182592  0.55706625] 1.0 False {}


1 [ 0.01683744 -0.16274778  0.02296725  0.26813251] 1.0 False {}


0 [ 0.01358248 -0.35818985  0.0283299   0.56797007] 1.0 False {}


1 [ 0.00641868 -0.16347649  0.0396893   0.2843451 ] 1.0 False {}


1 [ 0.00314915  0.03105758  0.0453762   0.00443939] 1.0 False {}


0 [ 0.0037703  -0.16468477  0.04546499  0.31108672] 1.0 False {}


0 [  4.76608807e-04  -3.60423983e-01   5.16867232e-02   6.17754236e-01] 1.0 False {}


1 [-0.00673187 -0.1660607   0.06404181  0.34178779] 1.0 False {}


1 [-0.01005308  0.02809441  0.07087756  0.06996756] 1.0 False {}


0 [-0.0094912  -0.16796834  0.07227691  0.38414385] 1.0 False {}


1 [-0.01285056  0.02605703  0.07995979  0.11509658] 1.0 False {}


1 [-0.01232942  0.21994764  0.08226172 -0.15132696] 1.0 False {}


1 [-0.00793047  0.41380122  0.07923518 -0.41696635] 1.0 False {}


In [94]:
# Scratch cell: draw one value from NumPy's global random generator (seeded with
# 123 near the top of the notebook) and display it as the cell output.
np.random.rand()

0.6700301041931712

In [103]:
# Render the environment's current state using the default render arguments.
env.render()

In [104]:
# Ask the environment to close its render viewer instead of drawing a frame.
# NOTE(review): the `close=True` keyword looks specific to older gym releases —
# confirm it is still accepted by the installed version.
env.render(close=True)

In [102]:
# Play one episode of at most 100 random actions in a freshly created, rendered
# environment. Each iteration prints the observation before the step and
# (action, reward, done) after it; the loop stops as soon as the episode ends.
#
# NOTE: this rebinds the notebook-wide `env` to a new instance (not seeded in this
# cell) and closes it at the end — presumably the environment has to be created
# again before `env` is handed back to the agent; confirm before reusing it.
env = gym.make(ENV_NAME)  # same id as the rest of the notebook ('CartPole-v0')
try:
    for i_episode in range(1):
        observation = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print(action, reward, done)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Release the render window even when the rollout is interrupted or raises.
    env.close()

[2017-10-23 15:46:04,848] Making new env: CartPole-v0


[ 0.00782149  0.01473252 -0.00262509 -0.03426497]
1 1.0 False
[ 0.00811614  0.20989202 -0.00331039 -0.32777498]
1 1.0 False
[ 0.01231398  0.40506094 -0.00986589 -0.62150002]
0 1.0 False
[ 0.0204152   0.21007814 -0.02229589 -0.33194058]
1 1.0 False
[ 0.02461676  0.40551024 -0.0289347  -0.63157031]
0 1.0 False
[ 0.03272697  0.21080369 -0.04156611 -0.34813835]
1 1.0 False
[ 0.03694304  0.40649143 -0.04852887 -0.65363335]
1 1.0 False
[ 0.04507287  0.60225434 -0.06160154 -0.96119389]
0 1.0 False
[ 0.05711795  0.40801199 -0.08082542 -0.68848241]
0 1.0 False
[ 0.06527819  0.2140993  -0.09459507 -0.42229976]
1 1.0 False
[ 0.06956018  0.41042521 -0.10304106 -0.74324152]
1 1.0 False
[ 0.07776868  0.60680702 -0.11790589 -1.06649231]
1 1.0 False
[ 0.08990483  0.80327483 -0.13923574 -1.39373203]
1 

1.0 False
[ 0.10597032  0.99982788 -0.16711038 -1.72651152]
0 1.0 False
[ 0.12596688  0.80696488 -0.20164061 -1.49014913]
1 1.0 True
Episode finished after 15 timesteps


In [None]:
# NOTE(review): unexecuted cell that rebinds `env` to a bare gym.Env() instead of an
# environment built with gym.make(...) like the other cells — presumably a scratch
# experiment; confirm before running, since later use of `env` would pick this up.
env = gym.Env()