In [2]:
import numpy as np
import gym
import time

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [3]:
ENV_NAME = 'CartPole-v0'
# One seed shared by NumPy and the environment so the two values cannot drift apart.
SEED = 123

# Create the environment, seed NumPy and the environment, then read the number
# of discrete actions (used as the width of the network's output layer).
# NOTE(review): the Keras/TensorFlow backend is not seeded in the visible cells,
# so weight initialisation may still differ between runs - confirm if exact
# reproducibility is required.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)
nb_actions = env.action_space.n

[2017-10-23 17:25:59,059] Making new env: CartPole-v0


In [18]:
# Q-network: flatten the observation window, pass it through three 16-unit ReLU
# layers, and emit one linear output per action.
# The leading 1 in input_shape is the window dimension; it has to agree with the
# window_length given to the agent's SequentialMemory.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" after the table).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_17 (Dense)             (None, 16)                80        
_________________________________________________________________
activation_17 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_18 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_19 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_19 (Activation)   (None, 16)                0         
__________

In [19]:
# Replay memory holding up to 50,000 entries; window_length=1 matches the
# leading dimension of the network's input shape.
memory = SequentialMemory(window_length=1, limit=50000)

# Boltzmann policy used by the agent to choose actions.
policy = BoltzmannQPolicy()

# Assemble the DQN agent from the network, the memory and the policy.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=nb_actions,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)

# Compile with Adam at learning rate 1e-3 and report mean absolute error.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [20]:
# Train for 50,000 environment steps with rendering disabled; verbose=2 logs one
# line per finished episode.
# The return value (keras-rl hands back the Keras History of the run) is kept so
# the learning curve can be inspected later without re-training, and so the cell
# does not echo a bare object repr.
history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

Training for 50000 steps ...




    12/50000: episode: 1, duration: 0.858s, episode steps: 12, steps per second: 14, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.141 [-0.739, 1.362], loss: 0.675215, mean_absolute_error: 0.642794, mean_q: 0.148291
    26/50000: episode: 2, duration: 0.038s, episode steps: 14, steps per second: 365, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.714 [0.000, 1.000], mean observation: -0.080 [-2.083, 1.234], loss: 0.586718, mean_absolute_error: 0.595457, mean_q: 0.168224
    41/50000: episode: 3, duration: 0.041s, episode steps: 15, steps per second: 369, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.733 [0.000, 1.000], mean observation: -0.107 [-2.370, 1.356], loss: 0.515500, mean_absolute_error: 0.604960, mean_q: 0.263367
    60/50000: episode: 4, duration: 0.051s, episode steps: 19, steps per second: 376, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000],



    98/50000: episode: 6, duration: 0.060s, episode steps: 20, steps per second: 335, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.081 [-0.753, 1.537], loss: 0.097278, mean_absolute_error: 0.697833, mean_q: 1.129900
   110/50000: episode: 7, duration: 0.034s, episode steps: 12, steps per second: 351, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.105 [-1.554, 0.975], loss: 0.059895, mean_absolute_error: 0.807912, mean_q: 1.402828
   126/50000: episode: 8, duration: 0.044s, episode steps: 16, steps per second: 365, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.688 [0.000, 1.000], mean observation: -0.056 [-2.351, 1.607], loss: 0.048785, mean_absolute_error: 0.825377, mean_q: 1.433264


   179/50000: episode: 9, duration: 0.140s, episode steps: 53, steps per second: 379, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.151 [-1.058, 1.746], loss: 0.049072, mean_absolute_error: 0.852755, mean_q: 1.533229
   194/50000: episode: 10, duration: 0.043s, episode steps: 15, steps per second: 349, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.733 [0.000, 1.000], mean observation: -0.101 [-2.333, 1.349], loss: 0.043415, mean_absolute_error: 0.967808, mean_q: 1.787156
   215/50000: episode: 11, duration: 0.056s, episode steps: 21, steps per second: 373, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.074 [-1.334, 2.290], loss: 0.054724, mean_absolute_error: 1.060877, mean_q: 1.996273
   234/50000: episode: 12, duration: 0.051s, episode steps: 19, steps per second: 374, episode reward: 19.000, mean reward: 1.000 [1.000, 1.00

   260/50000: episode: 14, duration: 0.044s, episode steps: 15, steps per second: 338, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.090 [-0.997, 1.852], loss: 0.082261, mean_absolute_error: 1.222561, mean_q: 2.255116
   275/50000: episode: 15, duration: 0.046s, episode steps: 15, steps per second: 325, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.106 [-1.175, 2.082], loss: 0.104481, mean_absolute_error: 1.304355, mean_q: 2.436423
   284/50000: episode: 16, duration: 0.027s, episode steps: 9, steps per second: 338, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.778 [0.000, 1.000], mean observation: -0.133 [-1.933, 1.194], loss: 0.079913, mean_absolute_error: 1.341383, mean_q: 2.538384
   301/50000: episode: 17, duration: 0.045s, episode steps: 17, steps per second: 376, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000

   333/50000: episode: 19, duration: 0.040s, episode steps: 14, steps per second: 348, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.286 [0.000, 1.000], mean observation: 0.100 [-1.176, 2.021], loss: 0.145078, mean_absolute_error: 1.541742, mean_q: 2.907894
   370/50000: episode: 20, duration: 0.102s, episode steps: 37, steps per second: 364, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.541 [0.000, 1.000], mean observation: 0.156 [-0.372, 0.716], loss: 0.138632, mean_absolute_error: 1.640514, mean_q: 3.106411
   390/50000: episode: 21, duration: 0.053s, episode steps: 20, steps per second: 376, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: -0.089 [-1.152, 0.768], loss: 0.140972, mean_absolute_error: 1.759237, mean_q: 3.374100
   399/50000: episode: 22, duration: 0.026s, episode steps: 9, steps per second: 352, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000

   416/50000: episode: 23, duration: 0.049s, episode steps: 17, steps per second: 346, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.706 [0.000, 1.000], mean observation: -0.057 [-2.399, 1.576], loss: 0.151134, mean_absolute_error: 1.893618, mean_q: 3.630559
   436/50000: episode: 24, duration: 0.056s, episode steps: 20, steps per second: 359, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.350 [0.000, 1.000], mean observation: 0.034 [-1.191, 1.896], loss: 0.217022, mean_absolute_error: 1.960384, mean_q: 3.663027
   447/50000: episode: 25, duration: 0.031s, episode steps: 11, steps per second: 352, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.108 [-0.766, 1.234], loss: 0.150577, mean_absolute_error: 2.023152, mean_q: 3.939039
   458/50000: episode: 26, duration: 0.030s, episode steps: 11, steps per second: 363, episode reward: 11.000, mean reward: 1.000 [1.000, 1.0

   526/50000: episode: 28, duration: 0.136s, episode steps: 50, steps per second: 369, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.440 [0.000, 1.000], mean observation: 0.056 [-1.328, 2.247], loss: 0.213590, mean_absolute_error: 2.280102, mean_q: 4.345592
   539/50000: episode: 29, duration: 0.036s, episode steps: 13, steps per second: 366, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.615 [0.000, 1.000], mean observation: -0.094 [-1.618, 1.026], loss: 0.178178, mean_absolute_error: 2.363559, mean_q: 4.519340
   555/50000: episode: 30, duration: 0.043s, episode steps: 16, steps per second: 371, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.066 [-1.203, 1.769], loss: 0.303893, mean_absolute_error: 2.495794, mean_q: 4.765454
   566/50000: episode: 31, duration: 0.030s, episode steps: 11, steps per second: 364, episode reward: 11.000, mean reward: 1.000 [1.000, 1.0

   620/50000: episode: 33, duration: 0.118s, episode steps: 42, steps per second: 356, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: -0.019 [-1.517, 0.846], loss: 0.321394, mean_absolute_error: 2.704954, mean_q: 5.148561
   635/50000: episode: 34, duration: 0.044s, episode steps: 15, steps per second: 343, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.103 [-1.016, 1.925], loss: 0.327106, mean_absolute_error: 2.814517, mean_q: 5.296378
   676/50000: episode: 35, duration: 0.107s, episode steps: 41, steps per second: 384, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.341 [0.000, 1.000], mean observation: -0.068 [-2.524, 3.117], loss: 0.282797, mean_absolute_error: 2.910530, mean_q: 5.536137
   690/50000: episode: 36, duration: 0.038s, episode steps: 14, steps per second: 365, episode reward: 14.000, mean reward: 1.000 [1.000, 1.

   714/50000: episode: 37, duration: 0.071s, episode steps: 24, steps per second: 338, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.085 [-1.170, 2.260], loss: 0.232339, mean_absolute_error: 3.050893, mean_q: 5.850178
   735/50000: episode: 38, duration: 0.057s, episode steps: 21, steps per second: 371, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.081 [-1.084, 0.591], loss: 0.305301, mean_absolute_error: 3.136000, mean_q: 5.989413
   759/50000: episode: 39, duration: 0.064s, episode steps: 24, steps per second: 376, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.078 [-0.816, 1.723], loss: 0.333981, mean_absolute_error: 3.239864, mean_q: 6.153958
   776/50000: episode: 40, duration: 0.049s, episode steps: 17, steps per second: 348, episode reward: 17.000, mean reward: 1.000 [1.000, 1.0

   808/50000: episode: 41, duration: 0.093s, episode steps: 32, steps per second: 343, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.056 [-0.646, 1.159], loss: 0.417199, mean_absolute_error: 3.384929, mean_q: 6.439037
   828/50000: episode: 42, duration: 0.054s, episode steps: 20, steps per second: 369, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.042 [-1.698, 1.025], loss: 0.397198, mean_absolute_error: 3.456669, mean_q: 6.547513
   851/50000: episode: 43, duration: 0.061s, episode steps: 23, steps per second: 377, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: 0.053 [-1.125, 1.611], loss: 0.357490, mean_absolute_error: 3.549908, mean_q: 6.764911
   873/50000: episode: 44, duration: 0.059s, episode steps: 22, steps per second: 372, episode reward: 22.000, mean reward: 1.000 [1.000, 1.0

   888/50000: episode: 45, duration: 0.042s, episode steps: 15, steps per second: 356, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.122 [-0.567, 1.329], loss: 0.332243, mean_absolute_error: 3.699020, mean_q: 7.126103
   910/50000: episode: 46, duration: 0.063s, episode steps: 22, steps per second: 350, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.060 [-1.272, 0.642], loss: 0.336223, mean_absolute_error: 3.778074, mean_q: 7.337976


   968/50000: episode: 47, duration: 0.174s, episode steps: 58, steps per second: 334, episode reward: 58.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: -0.001 [-1.382, 1.684], loss: 0.332679, mean_absolute_error: 3.905781, mean_q: 7.655163
   989/50000: episode: 48, duration: 0.058s, episode steps: 21, steps per second: 364, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.060 [-0.990, 1.646], loss: 0.313752, mean_absolute_error: 4.065966, mean_q: 8.019331
  1001/50000: episode: 49, duration: 0.034s, episode steps: 12, steps per second: 356, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.583 [0.000, 1.000], mean observation: -0.125 [-1.411, 0.768], loss: 0.252320, mean_absolute_error: 4.063694, mean_q: 8.038653
  1017/50000: episode: 50, duration: 0.044s, episode steps: 16, steps per second: 362, episode reward: 16.000, mean reward: 1.000 [1.000, 1.

  1064/50000: episode: 51, duration: 0.130s, episode steps: 47, steps per second: 362, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.143 [-1.045, 0.559], loss: 0.365389, mean_absolute_error: 4.261187, mean_q: 8.417190
  1101/50000: episode: 52, duration: 0.099s, episode steps: 37, steps per second: 375, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.082 [-1.557, 0.938], loss: 0.416002, mean_absolute_error: 4.436347, mean_q: 8.718307


  1185/50000: episode: 53, duration: 0.230s, episode steps: 84, steps per second: 366, episode reward: 84.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.142 [-1.315, 0.816], loss: 0.481500, mean_absolute_error: 4.670493, mean_q: 9.172182
  1224/50000: episode: 54, duration: 0.103s, episode steps: 39, steps per second: 378, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.083 [-1.004, 0.429], loss: 0.389043, mean_absolute_error: 4.934829, mean_q: 9.817523


  1288/50000: episode: 55, duration: 0.167s, episode steps: 64, steps per second: 383, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.187 [-1.701, 0.615], loss: 0.410338, mean_absolute_error: 5.134718, mean_q: 10.222794
  1364/50000: episode: 56, duration: 0.197s, episode steps: 76, steps per second: 385, episode reward: 76.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: -0.167 [-1.154, 1.091], loss: 0.512340, mean_absolute_error: 5.343684, mean_q: 10.623379


  1390/50000: episode: 57, duration: 0.071s, episode steps: 26, steps per second: 367, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.051 [-0.758, 1.335], loss: 0.326926, mean_absolute_error: 5.551072, mean_q: 11.156128


  1516/50000: episode: 58, duration: 0.327s, episode steps: 126, steps per second: 385, episode reward: 126.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: 0.054 [-1.565, 1.638], loss: 0.432381, mean_absolute_error: 5.892217, mean_q: 11.777811
  1541/50000: episode: 59, duration: 0.067s, episode steps: 25, steps per second: 371, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.073 [-1.016, 1.429], loss: 0.454068, mean_absolute_error: 6.214390, mean_q: 12.466532


  1629/50000: episode: 60, duration: 0.227s, episode steps: 88, steps per second: 387, episode reward: 88.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.219 [-1.540, 1.070], loss: 0.534623, mean_absolute_error: 6.431152, mean_q: 12.847071


  1724/50000: episode: 61, duration: 0.245s, episode steps: 95, steps per second: 387, episode reward: 95.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.307 [-1.302, 0.754], loss: 0.499010, mean_absolute_error: 6.778762, mean_q: 13.639145


  1860/50000: episode: 62, duration: 0.353s, episode steps: 136, steps per second: 385, episode reward: 136.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.174 [-1.272, 0.893], loss: 0.533996, mean_absolute_error: 7.302582, mean_q: 14.784121


  1983/50000: episode: 63, duration: 0.316s, episode steps: 123, steps per second: 389, episode reward: 123.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: -0.204 [-1.655, 1.327], loss: 0.652637, mean_absolute_error: 7.879586, mean_q: 15.927598


  2091/50000: episode: 64, duration: 0.279s, episode steps: 108, steps per second: 388, episode reward: 108.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.270 [-1.535, 1.441], loss: 0.836307, mean_absolute_error: 8.331722, mean_q: 16.828325


  2239/50000: episode: 65, duration: 0.390s, episode steps: 148, steps per second: 379, episode reward: 148.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: -0.200 [-1.640, 1.372], loss: 0.706383, mean_absolute_error: 8.864914, mean_q: 18.000479


  2378/50000: episode: 66, duration: 0.359s, episode steps: 139, steps per second: 387, episode reward: 139.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.460 [0.000, 1.000], mean observation: -0.402 [-2.585, 0.764], loss: 0.754848, mean_absolute_error: 9.511630, mean_q: 19.341877


  2578/50000: episode: 67, duration: 0.520s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.117 [-1.094, 0.792], loss: 0.689566, mean_absolute_error: 10.298049, mean_q: 20.984434


  2740/50000: episode: 68, duration: 0.419s, episode steps: 162, steps per second: 386, episode reward: 162.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.241 [-1.670, 1.024], loss: 0.857411, mean_absolute_error: 11.133391, mean_q: 22.629063


  2877/50000: episode: 69, duration: 0.354s, episode steps: 137, steps per second: 387, episode reward: 137.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.228 [-1.256, 1.097], loss: 1.016561, mean_absolute_error: 11.762638, mean_q: 23.874596


  3056/50000: episode: 70, duration: 0.459s, episode steps: 179, steps per second: 390, episode reward: 179.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.282 [-2.274, 1.060], loss: 1.115694, mean_absolute_error: 12.388538, mean_q: 25.166655


  3239/50000: episode: 71, duration: 0.471s, episode steps: 183, steps per second: 388, episode reward: 183.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.327 [-2.370, 0.963], loss: 1.304632, mean_absolute_error: 13.044406, mean_q: 26.445499


  3375/50000: episode: 72, duration: 0.356s, episode steps: 136, steps per second: 383, episode reward: 136.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.338 [-1.886, 1.003], loss: 1.114003, mean_absolute_error: 13.734120, mean_q: 27.972977


  3575/50000: episode: 73, duration: 0.529s, episode steps: 200, steps per second: 378, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.237 [-2.187, 1.138], loss: 1.430974, mean_absolute_error: 14.407236, mean_q: 29.353506


  3775/50000: episode: 74, duration: 0.530s, episode steps: 200, steps per second: 377, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.251 [-2.020, 1.231], loss: 1.524244, mean_absolute_error: 15.243882, mean_q: 31.039814


  3975/50000: episode: 75, duration: 0.536s, episode steps: 200, steps per second: 373, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.002 [-1.129, 1.239], loss: 1.371257, mean_absolute_error: 16.058174, mean_q: 32.744144


  4141/50000: episode: 76, duration: 0.459s, episode steps: 166, steps per second: 362, episode reward: 166.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.399 [-2.571, 1.163], loss: 1.470664, mean_absolute_error: 16.815580, mean_q: 34.300041


  4341/50000: episode: 77, duration: 0.528s, episode steps: 200, steps per second: 378, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.129 [-1.467, 1.192], loss: 1.719493, mean_absolute_error: 17.604122, mean_q: 35.813011


  4518/50000: episode: 78, duration: 0.463s, episode steps: 177, steps per second: 382, episode reward: 177.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.381 [-2.413, 1.004], loss: 1.858509, mean_absolute_error: 18.239182, mean_q: 37.120205


  4690/50000: episode: 79, duration: 0.445s, episode steps: 172, steps per second: 386, episode reward: 172.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.465 [0.000, 1.000], mean observation: -0.394 [-2.413, 0.873], loss: 2.001219, mean_absolute_error: 19.075918, mean_q: 38.739990


  4882/50000: episode: 80, duration: 0.514s, episode steps: 192, steps per second: 373, episode reward: 192.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.372 [-2.575, 0.966], loss: 1.719070, mean_absolute_error: 19.613602, mean_q: 39.966175


  5069/50000: episode: 81, duration: 0.495s, episode steps: 187, steps per second: 378, episode reward: 187.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.311 [-2.009, 1.131], loss: 1.748614, mean_absolute_error: 20.442686, mean_q: 41.607857


  5264/50000: episode: 82, duration: 0.507s, episode steps: 195, steps per second: 385, episode reward: 195.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: -0.347 [-2.417, 1.008], loss: 2.371797, mean_absolute_error: 21.149580, mean_q: 42.906902


  5464/50000: episode: 83, duration: 0.514s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.308 [-2.380, 1.311], loss: 2.104731, mean_absolute_error: 21.815414, mean_q: 44.297916


  5664/50000: episode: 84, duration: 0.516s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.227 [-1.512, 0.865], loss: 2.233350, mean_absolute_error: 22.507889, mean_q: 45.630878


  5864/50000: episode: 85, duration: 0.515s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.272 [-1.778, 0.929], loss: 1.958968, mean_absolute_error: 23.387798, mean_q: 47.412186


  6059/50000: episode: 86, duration: 0.504s, episode steps: 195, steps per second: 387, episode reward: 195.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.357 [-2.405, 1.009], loss: 2.779570, mean_absolute_error: 23.790545, mean_q: 48.282166


  6259/50000: episode: 87, duration: 0.514s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.293 [-2.089, 1.188], loss: 1.958996, mean_absolute_error: 24.491327, mean_q: 49.704845


  6459/50000: episode: 88, duration: 0.515s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.275 [-1.773, 0.970], loss: 3.082646, mean_absolute_error: 25.088734, mean_q: 50.804527


  6659/50000: episode: 89, duration: 0.515s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.302 [-1.947, 0.997], loss: 2.610958, mean_absolute_error: 25.682495, mean_q: 52.081974


  6859/50000: episode: 90, duration: 0.531s, episode steps: 200, steps per second: 377, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.341 [-2.275, 1.001], loss: 2.751452, mean_absolute_error: 26.393152, mean_q: 53.441586


  7059/50000: episode: 91, duration: 0.522s, episode steps: 200, steps per second: 383, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.234 [-1.615, 1.328], loss: 2.939220, mean_absolute_error: 26.897846, mean_q: 54.473125


  7259/50000: episode: 92, duration: 0.524s, episode steps: 200, steps per second: 382, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.334 [-2.268, 1.358], loss: 2.750575, mean_absolute_error: 27.448580, mean_q: 55.616581


  7459/50000: episode: 93, duration: 0.528s, episode steps: 200, steps per second: 379, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.328 [-2.203, 0.953], loss: 2.749203, mean_absolute_error: 27.896265, mean_q: 56.373535


  7659/50000: episode: 94, duration: 0.526s, episode steps: 200, steps per second: 380, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.105 [-1.274, 0.978], loss: 3.099206, mean_absolute_error: 28.347412, mean_q: 57.431007


  7859/50000: episode: 95, duration: 0.515s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.318 [-2.143, 0.945], loss: 3.145916, mean_absolute_error: 28.921553, mean_q: 58.447647


  8041/50000: episode: 96, duration: 0.471s, episode steps: 182, steps per second: 387, episode reward: 182.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.385 [-2.405, 1.084], loss: 3.570346, mean_absolute_error: 29.204445, mean_q: 58.995781


  8241/50000: episode: 97, duration: 0.514s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.261 [-1.846, 1.125], loss: 2.666545, mean_absolute_error: 29.663475, mean_q: 60.032192


  8441/50000: episode: 98, duration: 0.523s, episode steps: 200, steps per second: 382, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.258 [-2.256, 1.428], loss: 2.121795, mean_absolute_error: 30.102974, mean_q: 60.964844


  8618/50000: episode: 99, duration: 0.456s, episode steps: 177, steps per second: 388, episode reward: 177.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.395 [-2.409, 1.172], loss: 3.415417, mean_absolute_error: 30.461235, mean_q: 61.565426


  8804/50000: episode: 100, duration: 0.480s, episode steps: 186, steps per second: 387, episode reward: 186.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: -0.376 [-2.416, 1.341], loss: 3.081212, mean_absolute_error: 30.993023, mean_q: 62.642368


  9004/50000: episode: 101, duration: 0.515s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.286 [-1.982, 1.083], loss: 2.716688, mean_absolute_error: 31.127588, mean_q: 62.796524


  9204/50000: episode: 102, duration: 0.520s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.273 [-2.243, 1.521], loss: 3.919495, mean_absolute_error: 31.379692, mean_q: 63.296555


  9404/50000: episode: 103, duration: 0.512s, episode steps: 200, steps per second: 391, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.223 [-1.691, 1.238], loss: 2.759647, mean_absolute_error: 31.623291, mean_q: 63.804630


  9592/50000: episode: 104, duration: 0.485s, episode steps: 188, steps per second: 388, episode reward: 188.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: -0.372 [-2.405, 1.226], loss: 2.324176, mean_absolute_error: 32.117935, mean_q: 64.923721


  9792/50000: episode: 105, duration: 0.514s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.343 [-2.610, 1.272], loss: 3.271887, mean_absolute_error: 32.367985, mean_q: 65.345985


  9992/50000: episode: 106, duration: 0.519s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.263 [-1.830, 1.035], loss: 2.583184, mean_absolute_error: 32.648438, mean_q: 65.906265


 10192/50000: episode: 107, duration: 0.516s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.272 [-2.198, 1.330], loss: 2.769201, mean_absolute_error: 32.848473, mean_q: 66.309082


 10392/50000: episode: 108, duration: 0.516s, episode steps: 200, steps per second: 387, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.241 [-1.661, 1.308], loss: 2.871730, mean_absolute_error: 33.184792, mean_q: 66.958961


 10577/50000: episode: 109, duration: 0.485s, episode steps: 185, steps per second: 382, episode reward: 185.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.365 [-2.402, 1.440], loss: 2.273991, mean_absolute_error: 33.328335, mean_q: 67.197891


 10771/50000: episode: 110, duration: 0.506s, episode steps: 194, steps per second: 384, episode reward: 194.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.356 [-2.424, 1.229], loss: 1.365669, mean_absolute_error: 33.727741, mean_q: 68.146317


 10971/50000: episode: 111, duration: 0.524s, episode steps: 200, steps per second: 381, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.339 [-2.381, 1.395], loss: 3.710404, mean_absolute_error: 33.667049, mean_q: 67.904686


 11171/50000: episode: 112, duration: 0.527s, episode steps: 200, steps per second: 380, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.255 [-2.017, 1.270], loss: 2.249612, mean_absolute_error: 33.876652, mean_q: 68.340408


 11364/50000: episode: 113, duration: 0.498s, episode steps: 193, steps per second: 388, episode reward: 193.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.348 [-2.451, 1.246], loss: 2.351685, mean_absolute_error: 34.209282, mean_q: 68.999069


 11560/50000: episode: 114, duration: 0.508s, episode steps: 196, steps per second: 386, episode reward: 196.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.356 [-2.463, 1.423], loss: 1.954755, mean_absolute_error: 34.175980, mean_q: 68.981033


 11760/50000: episode: 115, duration: 0.522s, episode steps: 200, steps per second: 383, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.099 [-1.167, 1.063], loss: 2.269925, mean_absolute_error: 34.376228, mean_q: 69.337585


 11940/50000: episode: 116, duration: 0.479s, episode steps: 180, steps per second: 376, episode reward: 180.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.366 [-2.602, 1.394], loss: 1.869997, mean_absolute_error: 34.730907, mean_q: 70.084480


 12140/50000: episode: 117, duration: 0.531s, episode steps: 200, steps per second: 377, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.249 [-2.451, 1.354], loss: 1.670432, mean_absolute_error: 34.688446, mean_q: 69.968605


 12324/50000: episode: 118, duration: 0.480s, episode steps: 184, steps per second: 383, episode reward: 184.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.365 [-2.244, 1.573], loss: 1.955874, mean_absolute_error: 34.849831, mean_q: 70.171890


 12510/50000: episode: 119, duration: 0.492s, episode steps: 186, steps per second: 378, episode reward: 186.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.324 [-2.572, 1.527], loss: 2.956099, mean_absolute_error: 35.062057, mean_q: 70.670731


 12710/50000: episode: 120, duration: 0.520s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.284 [-2.290, 1.563], loss: 2.637482, mean_absolute_error: 34.987690, mean_q: 70.453094


 12910/50000: episode: 121, duration: 0.521s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.145 [-0.977, 1.472], loss: 1.848301, mean_absolute_error: 35.046432, mean_q: 70.749641


 13082/50000: episode: 122, duration: 0.453s, episode steps: 172, steps per second: 380, episode reward: 172.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: -0.334 [-2.091, 1.261], loss: 1.745373, mean_absolute_error: 35.628132, mean_q: 71.778206


 13282/50000: episode: 123, duration: 0.518s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.008 [-1.080, 1.108], loss: 2.831108, mean_absolute_error: 35.720211, mean_q: 71.831375


 13482/50000: episode: 124, duration: 0.522s, episode steps: 200, steps per second: 383, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.052 [-1.184, 1.151], loss: 1.814717, mean_absolute_error: 36.004948, mean_q: 72.534286


 13682/50000: episode: 125, duration: 0.529s, episode steps: 200, steps per second: 378, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.163 [-1.641, 1.527], loss: 2.975882, mean_absolute_error: 36.188667, mean_q: 72.783386


 13882/50000: episode: 126, duration: 0.516s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.053 [-1.285, 1.182], loss: 2.656871, mean_absolute_error: 36.470409, mean_q: 73.403000


 14082/50000: episode: 127, duration: 0.515s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.174 [-1.846, 1.634], loss: 3.503020, mean_absolute_error: 36.780762, mean_q: 73.873283


 14282/50000: episode: 128, duration: 0.515s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.069 [-1.580, 1.386], loss: 5.276283, mean_absolute_error: 36.703171, mean_q: 73.685822


 14482/50000: episode: 129, duration: 0.513s, episode steps: 200, steps per second: 390, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.297 [-1.035, 2.019], loss: 3.115714, mean_absolute_error: 36.847809, mean_q: 74.027306


 14682/50000: episode: 130, duration: 0.514s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.018 [-1.535, 1.324], loss: 4.707676, mean_absolute_error: 36.949005, mean_q: 74.299667


 14882/50000: episode: 131, duration: 0.523s, episode steps: 200, steps per second: 383, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.035 [-1.278, 1.164], loss: 3.494853, mean_absolute_error: 37.263260, mean_q: 74.909279


 15082/50000: episode: 132, duration: 0.520s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.016 [-1.411, 1.278], loss: 6.942764, mean_absolute_error: 37.374447, mean_q: 75.076157


 15282/50000: episode: 133, duration: 0.521s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.004 [-1.268, 1.336], loss: 4.318356, mean_absolute_error: 37.374336, mean_q: 75.100159


 15482/50000: episode: 134, duration: 0.512s, episode steps: 200, steps per second: 390, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.006 [-1.439, 1.321], loss: 3.561343, mean_absolute_error: 37.781071, mean_q: 75.942413


 15674/50000: episode: 135, duration: 0.495s, episode steps: 192, steps per second: 388, episode reward: 192.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.256 [-1.409, 1.802], loss: 5.188531, mean_absolute_error: 38.003132, mean_q: 76.165466


 15874/50000: episode: 136, duration: 0.516s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.109 [-0.946, 1.291], loss: 1.685904, mean_absolute_error: 38.214161, mean_q: 76.875008


 16074/50000: episode: 137, duration: 0.512s, episode steps: 200, steps per second: 391, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.196 [-1.205, 1.318], loss: 4.563233, mean_absolute_error: 38.395206, mean_q: 77.165863


 16257/50000: episode: 138, duration: 0.474s, episode steps: 183, steps per second: 386, episode reward: 183.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.297 [-1.224, 1.896], loss: 6.457265, mean_absolute_error: 39.001110, mean_q: 78.303474


 16343/50000: episode: 139, duration: 0.225s, episode steps: 86, steps per second: 383, episode reward: 86.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.164 [-2.264, 1.886], loss: 15.131557, mean_absolute_error: 39.020603, mean_q: 77.792488


 16543/50000: episode: 140, duration: 0.519s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.225 [-1.921, 1.820], loss: 3.641752, mean_absolute_error: 38.889229, mean_q: 78.098434


 16743/50000: episode: 141, duration: 0.519s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.059 [-1.456, 1.663], loss: 3.751797, mean_absolute_error: 39.063332, mean_q: 78.481415


 16936/50000: episode: 142, duration: 0.507s, episode steps: 193, steps per second: 381, episode reward: 193.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.290 [-1.231, 1.834], loss: 6.029430, mean_absolute_error: 39.438210, mean_q: 79.248245


 17136/50000: episode: 143, duration: 0.536s, episode steps: 200, steps per second: 373, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.340 [-2.106, 1.873], loss: 8.863396, mean_absolute_error: 39.901093, mean_q: 80.090431


 17336/50000: episode: 144, duration: 0.524s, episode steps: 200, steps per second: 382, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.176 [-1.437, 1.614], loss: 5.672781, mean_absolute_error: 40.017323, mean_q: 80.450974


 17536/50000: episode: 145, duration: 0.521s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.069 [-1.276, 1.441], loss: 10.721572, mean_absolute_error: 40.536045, mean_q: 81.503761


 17736/50000: episode: 146, duration: 0.532s, episode steps: 200, steps per second: 376, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.040 [-1.761, 1.971], loss: 8.186724, mean_absolute_error: 40.516281, mean_q: 81.410645


 17927/50000: episode: 147, duration: 0.538s, episode steps: 191, steps per second: 355, episode reward: 191.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: 0.285 [-1.476, 1.903], loss: 9.920287, mean_absolute_error: 40.820370, mean_q: 82.131157


 18107/50000: episode: 148, duration: 0.497s, episode steps: 180, steps per second: 362, episode reward: 180.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.331 [-1.207, 2.008], loss: 7.659863, mean_absolute_error: 41.260056, mean_q: 83.132423


 18307/50000: episode: 149, duration: 0.520s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.051 [-1.505, 1.524], loss: 5.504931, mean_absolute_error: 41.509865, mean_q: 83.754921


 18507/50000: episode: 150, duration: 0.520s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.281 [-1.083, 1.814], loss: 10.068687, mean_absolute_error: 42.259571, mean_q: 84.993263


 18707/50000: episode: 151, duration: 0.526s, episode steps: 200, steps per second: 380, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.139 [-1.156, 1.358], loss: 8.308576, mean_absolute_error: 42.483971, mean_q: 85.560852


 18907/50000: episode: 152, duration: 0.525s, episode steps: 200, steps per second: 381, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.045 [-1.727, 1.901], loss: 9.008581, mean_absolute_error: 42.765751, mean_q: 85.992264


 19107/50000: episode: 153, duration: 0.551s, episode steps: 200, steps per second: 363, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.066 [-1.735, 1.716], loss: 6.804813, mean_absolute_error: 43.110489, mean_q: 86.706505


 19307/50000: episode: 154, duration: 0.530s, episode steps: 200, steps per second: 377, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.287 [-1.743, 1.436], loss: 11.094924, mean_absolute_error: 43.478127, mean_q: 87.315880


 19507/50000: episode: 155, duration: 0.542s, episode steps: 200, steps per second: 369, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.303 [-2.103, 1.847], loss: 9.681936, mean_absolute_error: 43.313332, mean_q: 87.207054


 19707/50000: episode: 156, duration: 0.561s, episode steps: 200, steps per second: 357, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.226 [-1.724, 1.532], loss: 9.162357, mean_absolute_error: 43.590889, mean_q: 87.673485


 19907/50000: episode: 157, duration: 0.639s, episode steps: 200, steps per second: 313, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.372 [-2.390, 1.852], loss: 4.793701, mean_absolute_error: 43.725006, mean_q: 88.147881


 20107/50000: episode: 158, duration: 0.593s, episode steps: 200, steps per second: 337, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.339 [-2.175, 1.564], loss: 8.753746, mean_absolute_error: 43.906540, mean_q: 88.221054


 20307/50000: episode: 159, duration: 0.616s, episode steps: 200, steps per second: 325, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.281 [-2.031, 1.795], loss: 11.424387, mean_absolute_error: 43.917500, mean_q: 88.197029


 20507/50000: episode: 160, duration: 0.590s, episode steps: 200, steps per second: 339, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.335 [-1.870, 1.305], loss: 10.472796, mean_absolute_error: 43.799751, mean_q: 88.195778


 20707/50000: episode: 161, duration: 0.580s, episode steps: 200, steps per second: 345, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.307 [-1.871, 1.657], loss: 7.506404, mean_absolute_error: 43.990078, mean_q: 88.710915


 20907/50000: episode: 162, duration: 0.554s, episode steps: 200, steps per second: 361, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.293 [-1.925, 1.657], loss: 7.976143, mean_absolute_error: 44.064873, mean_q: 88.766579


 21107/50000: episode: 163, duration: 0.556s, episode steps: 200, steps per second: 359, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.225 [-1.894, 1.555], loss: 11.017331, mean_absolute_error: 44.136456, mean_q: 88.811127


 21307/50000: episode: 164, duration: 0.550s, episode steps: 200, steps per second: 364, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.204 [-1.548, 1.490], loss: 5.911118, mean_absolute_error: 43.867821, mean_q: 88.457283


 21507/50000: episode: 165, duration: 0.522s, episode steps: 200, steps per second: 383, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.135 [-1.946, 1.751], loss: 10.036362, mean_absolute_error: 43.888397, mean_q: 88.310608


 21707/50000: episode: 166, duration: 0.517s, episode steps: 200, steps per second: 387, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.238 [-2.079, 1.985], loss: 11.477006, mean_absolute_error: 43.908730, mean_q: 88.205750


 21907/50000: episode: 167, duration: 0.565s, episode steps: 200, steps per second: 354, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.017 [-1.526, 1.388], loss: 6.623453, mean_absolute_error: 43.551170, mean_q: 87.814301


 22107/50000: episode: 168, duration: 0.593s, episode steps: 200, steps per second: 337, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.098 [-1.855, 1.830], loss: 7.518648, mean_absolute_error: 43.709549, mean_q: 87.995117


 22307/50000: episode: 169, duration: 0.578s, episode steps: 200, steps per second: 346, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.080 [-1.691, 1.753], loss: 9.719325, mean_absolute_error: 43.591209, mean_q: 87.887131


 22507/50000: episode: 170, duration: 0.586s, episode steps: 200, steps per second: 341, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.039 [-1.180, 1.253], loss: 9.592170, mean_absolute_error: 43.864468, mean_q: 88.319260


 22707/50000: episode: 171, duration: 0.628s, episode steps: 200, steps per second: 318, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.225 [-1.849, 1.800], loss: 10.458418, mean_absolute_error: 43.751328, mean_q: 87.931389


 22907/50000: episode: 172, duration: 0.558s, episode steps: 200, steps per second: 359, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.254 [-1.849, 1.634], loss: 8.715259, mean_absolute_error: 43.482822, mean_q: 87.645218


 23107/50000: episode: 173, duration: 0.572s, episode steps: 200, steps per second: 350, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.344 [-2.155, 1.510], loss: 6.330211, mean_absolute_error: 43.439308, mean_q: 87.452644


 23307/50000: episode: 174, duration: 0.574s, episode steps: 200, steps per second: 348, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.268 [-2.032, 1.525], loss: 5.046150, mean_absolute_error: 43.163673, mean_q: 86.926407


 23507/50000: episode: 175, duration: 0.586s, episode steps: 200, steps per second: 341, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.233 [-1.919, 1.695], loss: 7.161950, mean_absolute_error: 43.155819, mean_q: 86.930710


 23707/50000: episode: 176, duration: 0.572s, episode steps: 200, steps per second: 350, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.165 [-2.058, 1.654], loss: 9.355667, mean_absolute_error: 43.016590, mean_q: 86.465424


 23907/50000: episode: 177, duration: 0.565s, episode steps: 200, steps per second: 354, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.253 [-2.090, 1.682], loss: 7.960004, mean_absolute_error: 43.207802, mean_q: 86.981621


 24107/50000: episode: 178, duration: 0.603s, episode steps: 200, steps per second: 331, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.172 [-1.720, 1.758], loss: 7.274153, mean_absolute_error: 42.749340, mean_q: 86.152382


 24307/50000: episode: 179, duration: 0.576s, episode steps: 200, steps per second: 347, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.179 [-1.892, 1.903], loss: 7.259795, mean_absolute_error: 42.914448, mean_q: 86.300896


 24507/50000: episode: 180, duration: 0.548s, episode steps: 200, steps per second: 365, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.190 [-1.474, 1.366], loss: 9.442955, mean_absolute_error: 42.555771, mean_q: 85.594299


 24707/50000: episode: 181, duration: 0.531s, episode steps: 200, steps per second: 377, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.210 [-1.943, 1.689], loss: 9.840685, mean_absolute_error: 42.716164, mean_q: 85.806190


 24907/50000: episode: 182, duration: 0.533s, episode steps: 200, steps per second: 375, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.042 [-1.537, 1.484], loss: 8.790051, mean_absolute_error: 42.909168, mean_q: 86.037033


 25021/50000: episode: 183, duration: 0.317s, episode steps: 114, steps per second: 360, episode reward: 114.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.101 [-1.960, 2.009], loss: 14.627007, mean_absolute_error: 42.913857, mean_q: 85.959030


 25221/50000: episode: 184, duration: 0.528s, episode steps: 200, steps per second: 379, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.256 [-1.629, 1.722], loss: 9.046131, mean_absolute_error: 42.881039, mean_q: 86.054939


 25421/50000: episode: 185, duration: 0.522s, episode steps: 200, steps per second: 383, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.117 [-1.442, 1.499], loss: 5.665536, mean_absolute_error: 42.742363, mean_q: 85.961075


 25621/50000: episode: 186, duration: 0.526s, episode steps: 200, steps per second: 380, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.229 [-2.062, 1.503], loss: 8.640127, mean_absolute_error: 42.264244, mean_q: 85.005409


 25821/50000: episode: 187, duration: 0.520s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.289 [-2.290, 1.510], loss: 8.729380, mean_absolute_error: 42.410690, mean_q: 85.317535


 26021/50000: episode: 188, duration: 0.520s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.013 [-1.844, 1.532], loss: 7.087723, mean_absolute_error: 42.230766, mean_q: 84.981209


 26221/50000: episode: 189, duration: 0.531s, episode steps: 200, steps per second: 377, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.264 [-2.783, 1.721], loss: 10.794056, mean_absolute_error: 42.427387, mean_q: 85.331604


 26421/50000: episode: 190, duration: 0.545s, episode steps: 200, steps per second: 367, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.165 [-1.661, 1.541], loss: 7.675509, mean_absolute_error: 42.196754, mean_q: 84.783455


 26621/50000: episode: 191, duration: 0.524s, episode steps: 200, steps per second: 382, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.064 [-1.462, 1.522], loss: 8.105206, mean_absolute_error: 41.958828, mean_q: 84.523346


 26770/50000: episode: 192, duration: 0.395s, episode steps: 149, steps per second: 378, episode reward: 149.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.343 [-3.201, 2.033], loss: 9.658879, mean_absolute_error: 42.058731, mean_q: 84.532936


 26970/50000: episode: 193, duration: 0.526s, episode steps: 200, steps per second: 381, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.038 [-1.663, 1.741], loss: 10.535391, mean_absolute_error: 41.826447, mean_q: 84.253082


 27170/50000: episode: 194, duration: 0.522s, episode steps: 200, steps per second: 383, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.056 [-1.712, 1.397], loss: 9.044529, mean_absolute_error: 42.073963, mean_q: 84.481926


 27370/50000: episode: 195, duration: 0.519s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.116 [-1.740, 1.388], loss: 11.629202, mean_absolute_error: 41.698730, mean_q: 83.730759


 27570/50000: episode: 196, duration: 0.527s, episode steps: 200, steps per second: 380, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.134 [-1.622, 1.814], loss: 5.509110, mean_absolute_error: 42.073536, mean_q: 84.630478


 27770/50000: episode: 197, duration: 0.528s, episode steps: 200, steps per second: 379, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.191 [-1.088, 1.558], loss: 11.701320, mean_absolute_error: 41.487061, mean_q: 83.223190


 27970/50000: episode: 198, duration: 0.527s, episode steps: 200, steps per second: 380, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.119 [-1.476, 1.367], loss: 10.317853, mean_absolute_error: 41.435341, mean_q: 83.296112


 28170/50000: episode: 199, duration: 0.515s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.297 [-3.039, 2.196], loss: 7.323558, mean_absolute_error: 41.624237, mean_q: 83.873756


 28370/50000: episode: 200, duration: 0.521s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.107 [-1.676, 1.911], loss: 12.420193, mean_absolute_error: 41.437363, mean_q: 83.367340


 28570/50000: episode: 201, duration: 0.534s, episode steps: 200, steps per second: 374, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.256 [-1.293, 1.465], loss: 7.635826, mean_absolute_error: 41.360508, mean_q: 83.419807


 28770/50000: episode: 202, duration: 0.529s, episode steps: 200, steps per second: 378, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.276 [-1.202, 1.506], loss: 8.343861, mean_absolute_error: 41.657509, mean_q: 83.961639


 28970/50000: episode: 203, duration: 0.516s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.238 [-1.026, 1.249], loss: 8.317966, mean_absolute_error: 41.966019, mean_q: 84.401741


 29170/50000: episode: 204, duration: 0.518s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.193 [-1.374, 1.761], loss: 10.924490, mean_absolute_error: 41.570213, mean_q: 83.600525


 29370/50000: episode: 205, duration: 0.515s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.347 [-1.197, 1.741], loss: 12.532559, mean_absolute_error: 41.752567, mean_q: 84.020058


 29570/50000: episode: 206, duration: 0.523s, episode steps: 200, steps per second: 382, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.256 [-1.165, 1.328], loss: 6.009670, mean_absolute_error: 41.997971, mean_q: 84.539276


 29770/50000: episode: 207, duration: 0.522s, episode steps: 200, steps per second: 383, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.384 [-0.981, 2.221], loss: 15.976538, mean_absolute_error: 42.427746, mean_q: 85.202156


 29970/50000: episode: 208, duration: 0.532s, episode steps: 200, steps per second: 376, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.222 [-1.540, 1.652], loss: 12.360159, mean_absolute_error: 42.264198, mean_q: 84.939079


 30170/50000: episode: 209, duration: 0.517s, episode steps: 200, steps per second: 387, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.198 [-1.273, 1.283], loss: 10.962868, mean_absolute_error: 42.319283, mean_q: 84.977966


 30370/50000: episode: 210, duration: 0.528s, episode steps: 200, steps per second: 379, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.244 [-1.373, 1.662], loss: 11.733879, mean_absolute_error: 42.272362, mean_q: 85.175789


 30570/50000: episode: 211, duration: 0.521s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.125 [-1.392, 1.721], loss: 7.842222, mean_absolute_error: 42.345402, mean_q: 85.313675


 30770/50000: episode: 212, duration: 0.532s, episode steps: 200, steps per second: 376, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.075 [-1.277, 1.281], loss: 13.123366, mean_absolute_error: 42.823742, mean_q: 85.989372


 30920/50000: episode: 213, duration: 0.399s, episode steps: 150, steps per second: 376, episode reward: 150.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.137 [-1.879, 1.753], loss: 15.226217, mean_absolute_error: 42.579044, mean_q: 85.533493


 31120/50000: episode: 214, duration: 0.525s, episode steps: 200, steps per second: 381, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.200 [-1.335, 1.542], loss: 11.180306, mean_absolute_error: 42.518349, mean_q: 85.632767


 31320/50000: episode: 215, duration: 0.520s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.178 [-1.278, 1.330], loss: 10.192883, mean_absolute_error: 42.335880, mean_q: 85.363014


 31520/50000: episode: 216, duration: 0.526s, episode steps: 200, steps per second: 380, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.090 [-1.285, 1.208], loss: 10.809512, mean_absolute_error: 42.550735, mean_q: 85.797874


 31720/50000: episode: 217, duration: 0.521s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.072 [-1.573, 1.601], loss: 19.060173, mean_absolute_error: 42.498947, mean_q: 85.332268


 31920/50000: episode: 218, duration: 0.528s, episode steps: 200, steps per second: 379, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.079 [-1.705, 1.830], loss: 18.201618, mean_absolute_error: 42.243664, mean_q: 84.732414


 32120/50000: episode: 219, duration: 0.601s, episode steps: 200, steps per second: 333, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.037 [-1.624, 1.353], loss: 11.834787, mean_absolute_error: 41.782177, mean_q: 84.079384


 32320/50000: episode: 220, duration: 0.617s, episode steps: 200, steps per second: 324, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.189 [-1.410, 1.467], loss: 11.319877, mean_absolute_error: 41.901443, mean_q: 84.247307


 32520/50000: episode: 221, duration: 0.571s, episode steps: 200, steps per second: 350, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.044 [-1.585, 1.617], loss: 12.067901, mean_absolute_error: 41.889908, mean_q: 84.239563


 32720/50000: episode: 222, duration: 0.581s, episode steps: 200, steps per second: 344, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.307 [-1.245, 1.555], loss: 11.745431, mean_absolute_error: 42.082050, mean_q: 84.834015


 32920/50000: episode: 223, duration: 0.620s, episode steps: 200, steps per second: 322, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.280 [-1.468, 1.681], loss: 15.105668, mean_absolute_error: 42.028122, mean_q: 84.825722


 33120/50000: episode: 224, duration: 0.582s, episode steps: 200, steps per second: 343, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.269 [-1.020, 1.601], loss: 9.459899, mean_absolute_error: 42.413525, mean_q: 85.639297


 33316/50000: episode: 225, duration: 0.581s, episode steps: 196, steps per second: 337, episode reward: 196.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.430 [-1.260, 2.417], loss: 9.089149, mean_absolute_error: 42.821110, mean_q: 86.490913


 33516/50000: episode: 226, duration: 0.587s, episode steps: 200, steps per second: 341, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.258 [-1.005, 1.285], loss: 14.509814, mean_absolute_error: 42.821007, mean_q: 86.310974


 33716/50000: episode: 227, duration: 0.595s, episode steps: 200, steps per second: 336, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.297 [-0.984, 1.565], loss: 13.329343, mean_absolute_error: 42.996357, mean_q: 86.779434


 33916/50000: episode: 228, duration: 0.571s, episode steps: 200, steps per second: 350, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.341 [-1.082, 2.083], loss: 11.940948, mean_absolute_error: 43.225777, mean_q: 87.353050


 34116/50000: episode: 229, duration: 0.621s, episode steps: 200, steps per second: 322, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.357 [-1.011, 2.094], loss: 13.241629, mean_absolute_error: 43.398361, mean_q: 87.517326


 34316/50000: episode: 230, duration: 0.566s, episode steps: 200, steps per second: 353, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.244 [-0.728, 1.420], loss: 11.509502, mean_absolute_error: 43.177071, mean_q: 87.181702


 34516/50000: episode: 231, duration: 0.584s, episode steps: 200, steps per second: 342, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.267 [-1.048, 1.376], loss: 13.305539, mean_absolute_error: 43.327377, mean_q: 87.215668


 34716/50000: episode: 232, duration: 0.551s, episode steps: 200, steps per second: 363, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.099 [-1.272, 1.429], loss: 10.710091, mean_absolute_error: 43.463680, mean_q: 87.755646


 34916/50000: episode: 233, duration: 0.532s, episode steps: 200, steps per second: 376, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.200 [-1.176, 1.320], loss: 12.638686, mean_absolute_error: 43.515358, mean_q: 87.885139


 35116/50000: episode: 234, duration: 0.529s, episode steps: 200, steps per second: 378, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.081 [-1.171, 1.163], loss: 8.684873, mean_absolute_error: 43.361122, mean_q: 87.621132


 35316/50000: episode: 235, duration: 0.520s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.327 [-1.258, 1.818], loss: 14.546597, mean_absolute_error: 43.528683, mean_q: 87.852692


 35516/50000: episode: 236, duration: 0.530s, episode steps: 200, steps per second: 378, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.170 [-1.473, 1.923], loss: 7.994064, mean_absolute_error: 43.670048, mean_q: 88.171501


 35716/50000: episode: 237, duration: 0.528s, episode steps: 200, steps per second: 379, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.181 [-1.193, 1.361], loss: 11.165796, mean_absolute_error: 43.659634, mean_q: 88.094551


 35916/50000: episode: 238, duration: 0.537s, episode steps: 200, steps per second: 373, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.152 [-1.490, 1.490], loss: 10.949183, mean_absolute_error: 43.819454, mean_q: 88.345665


 36116/50000: episode: 239, duration: 0.540s, episode steps: 200, steps per second: 370, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.141 [-2.089, 1.999], loss: 10.065773, mean_absolute_error: 43.645084, mean_q: 88.016815


 36316/50000: episode: 240, duration: 0.603s, episode steps: 200, steps per second: 331, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.124 [-1.383, 1.724], loss: 10.850034, mean_absolute_error: 43.901413, mean_q: 88.342323


 36516/50000: episode: 241, duration: 0.544s, episode steps: 200, steps per second: 368, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.159 [-1.736, 1.448], loss: 6.900406, mean_absolute_error: 43.818077, mean_q: 88.363838


 36716/50000: episode: 242, duration: 0.561s, episode steps: 200, steps per second: 356, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.158 [-1.113, 1.365], loss: 11.773350, mean_absolute_error: 43.866371, mean_q: 88.193886


 36916/50000: episode: 243, duration: 0.530s, episode steps: 200, steps per second: 377, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.124 [-1.641, 1.309], loss: 9.087672, mean_absolute_error: 43.782204, mean_q: 88.038322


 37116/50000: episode: 244, duration: 0.567s, episode steps: 200, steps per second: 353, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.198 [-1.049, 1.146], loss: 8.226263, mean_absolute_error: 43.857265, mean_q: 88.339798


 37316/50000: episode: 245, duration: 0.542s, episode steps: 200, steps per second: 369, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.064 [-1.477, 1.335], loss: 13.329337, mean_absolute_error: 43.973217, mean_q: 88.175407


 37516/50000: episode: 246, duration: 0.988s, episode steps: 200, steps per second: 202, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.223 [-1.622, 1.732], loss: 10.686899, mean_absolute_error: 43.470615, mean_q: 87.482735


 37716/50000: episode: 247, duration: 0.633s, episode steps: 200, steps per second: 316, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.282 [-1.182, 1.344], loss: 8.792107, mean_absolute_error: 43.607635, mean_q: 87.792595


 37916/50000: episode: 248, duration: 0.571s, episode steps: 200, steps per second: 350, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.283 [-1.182, 1.528], loss: 11.205630, mean_absolute_error: 43.254650, mean_q: 87.014832


 38116/50000: episode: 249, duration: 0.586s, episode steps: 200, steps per second: 341, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.298 [-1.011, 1.574], loss: 11.301955, mean_absolute_error: 43.586357, mean_q: 87.633278


 38316/50000: episode: 250, duration: 0.574s, episode steps: 200, steps per second: 349, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.277 [-1.202, 1.700], loss: 7.863056, mean_absolute_error: 43.447567, mean_q: 87.395844


 38516/50000: episode: 251, duration: 0.519s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.237 [-1.480, 1.649], loss: 10.340342, mean_absolute_error: 43.456612, mean_q: 87.359482


 38716/50000: episode: 252, duration: 0.521s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.324 [-1.044, 1.687], loss: 12.128665, mean_absolute_error: 43.158337, mean_q: 86.835724


 38916/50000: episode: 253, duration: 0.521s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.335 [-1.151, 1.969], loss: 11.408987, mean_absolute_error: 43.095547, mean_q: 86.839394


 39116/50000: episode: 254, duration: 0.526s, episode steps: 200, steps per second: 380, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.193 [-0.952, 1.293], loss: 14.241421, mean_absolute_error: 42.733959, mean_q: 86.011612


 39316/50000: episode: 255, duration: 0.519s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.285 [-1.294, 1.702], loss: 10.934593, mean_absolute_error: 42.861519, mean_q: 86.355042


 39516/50000: episode: 256, duration: 0.521s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.301 [-0.842, 1.728], loss: 12.521641, mean_absolute_error: 42.713165, mean_q: 85.991951


 39716/50000: episode: 257, duration: 0.518s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.233 [-1.167, 1.549], loss: 10.796456, mean_absolute_error: 42.671764, mean_q: 86.104179


 39916/50000: episode: 258, duration: 0.519s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.292 [-1.565, 1.894], loss: 10.163157, mean_absolute_error: 42.787586, mean_q: 86.195778


 40116/50000: episode: 259, duration: 0.515s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.289 [-0.919, 1.587], loss: 10.972295, mean_absolute_error: 42.727398, mean_q: 86.068497


 40316/50000: episode: 260, duration: 0.516s, episode steps: 200, steps per second: 387, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.151 [-1.165, 1.490], loss: 8.199177, mean_absolute_error: 42.571728, mean_q: 85.825974


 40516/50000: episode: 261, duration: 0.515s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.280 [-0.996, 1.734], loss: 7.866611, mean_absolute_error: 42.548168, mean_q: 86.019531


 40716/50000: episode: 262, duration: 0.519s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.195 [-1.011, 1.098], loss: 5.969899, mean_absolute_error: 42.819145, mean_q: 86.426170


 40916/50000: episode: 263, duration: 0.523s, episode steps: 200, steps per second: 383, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.189 [-1.111, 1.303], loss: 8.451831, mean_absolute_error: 42.472198, mean_q: 85.626556


 41116/50000: episode: 264, duration: 0.531s, episode steps: 200, steps per second: 377, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.262 [-1.149, 1.658], loss: 11.124822, mean_absolute_error: 42.532261, mean_q: 85.625710


 41316/50000: episode: 265, duration: 0.526s, episode steps: 200, steps per second: 380, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.217 [-1.186, 1.362], loss: 9.394390, mean_absolute_error: 42.362728, mean_q: 85.229141


 41516/50000: episode: 266, duration: 0.528s, episode steps: 200, steps per second: 379, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.324 [-0.918, 2.211], loss: 7.071443, mean_absolute_error: 42.192860, mean_q: 85.075394


 41710/50000: episode: 267, duration: 0.506s, episode steps: 194, steps per second: 383, episode reward: 194.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.409 [-1.218, 2.415], loss: 6.160014, mean_absolute_error: 42.581467, mean_q: 85.850815


 41910/50000: episode: 268, duration: 0.538s, episode steps: 200, steps per second: 372, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.086 [-1.023, 1.171], loss: 9.240800, mean_absolute_error: 42.752628, mean_q: 86.127754


 42110/50000: episode: 269, duration: 0.526s, episode steps: 200, steps per second: 380, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.229 [-0.942, 1.424], loss: 7.559816, mean_absolute_error: 42.740791, mean_q: 86.149490


 42310/50000: episode: 270, duration: 0.525s, episode steps: 200, steps per second: 381, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.208 [-1.082, 1.545], loss: 7.306564, mean_absolute_error: 42.745850, mean_q: 86.146896


 42479/50000: episode: 271, duration: 0.444s, episode steps: 169, steps per second: 380, episode reward: 169.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.408 [-1.278, 2.421], loss: 11.510160, mean_absolute_error: 42.655064, mean_q: 85.902763


 42679/50000: episode: 272, duration: 0.518s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.175 [-1.213, 1.278], loss: 5.149321, mean_absolute_error: 42.665295, mean_q: 85.992828


 42879/50000: episode: 273, duration: 0.521s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.227 [-1.143, 1.410], loss: 10.407898, mean_absolute_error: 42.583694, mean_q: 85.773476


 43079/50000: episode: 274, duration: 0.516s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.386 [-0.946, 2.151], loss: 9.455733, mean_absolute_error: 42.464756, mean_q: 85.536156


 43279/50000: episode: 275, duration: 0.518s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.359 [-1.429, 2.269], loss: 9.499067, mean_absolute_error: 42.411686, mean_q: 85.447617


 43465/50000: episode: 276, duration: 0.484s, episode steps: 186, steps per second: 385, episode reward: 186.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.419 [-0.976, 2.403], loss: 5.809301, mean_absolute_error: 42.073566, mean_q: 84.974800


 43665/50000: episode: 277, duration: 0.516s, episode steps: 200, steps per second: 387, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.096 [-1.478, 1.452], loss: 7.923213, mean_absolute_error: 42.282921, mean_q: 85.315491


 43865/50000: episode: 278, duration: 0.520s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.285 [-0.906, 1.918], loss: 8.115348, mean_absolute_error: 42.169018, mean_q: 85.009834


 44065/50000: episode: 279, duration: 0.514s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.172 [-1.580, 1.542], loss: 7.206708, mean_absolute_error: 42.433571, mean_q: 85.354424


 44265/50000: episode: 280, duration: 0.518s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.100 [-1.508, 1.274], loss: 6.577838, mean_absolute_error: 42.152802, mean_q: 84.941795


 44457/50000: episode: 281, duration: 0.495s, episode steps: 192, steps per second: 388, episode reward: 192.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.410 [-1.041, 2.407], loss: 6.081545, mean_absolute_error: 42.141132, mean_q: 84.918358


 44657/50000: episode: 282, duration: 0.520s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.233 [-1.310, 1.585], loss: 7.380685, mean_absolute_error: 41.893467, mean_q: 84.411736


 44857/50000: episode: 283, duration: 0.516s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.400 [-1.047, 2.395], loss: 8.470584, mean_absolute_error: 42.306515, mean_q: 85.147530


 45057/50000: episode: 284, duration: 0.520s, episode steps: 200, steps per second: 384, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.081 [-1.291, 1.257], loss: 5.358489, mean_absolute_error: 41.791836, mean_q: 84.135887


 45257/50000: episode: 285, duration: 0.526s, episode steps: 200, steps per second: 381, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.288 [-1.286, 1.706], loss: 8.833231, mean_absolute_error: 41.741177, mean_q: 83.860672


 45457/50000: episode: 286, duration: 0.524s, episode steps: 200, steps per second: 381, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.360 [-1.079, 2.129], loss: 9.534994, mean_absolute_error: 41.692944, mean_q: 83.725296


 45657/50000: episode: 287, duration: 0.518s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.322 [-1.063, 1.906], loss: 6.266841, mean_absolute_error: 41.516632, mean_q: 83.716087


 45857/50000: episode: 288, duration: 0.530s, episode steps: 200, steps per second: 377, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.362 [-1.257, 2.080], loss: 6.230986, mean_absolute_error: 41.485294, mean_q: 83.548927


 46057/50000: episode: 289, duration: 0.525s, episode steps: 200, steps per second: 381, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.305 [-1.173, 1.836], loss: 6.899157, mean_absolute_error: 41.555862, mean_q: 83.610985


 46257/50000: episode: 290, duration: 0.518s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.256 [-1.245, 1.554], loss: 5.332394, mean_absolute_error: 41.249668, mean_q: 83.172096


 46457/50000: episode: 291, duration: 0.517s, episode steps: 200, steps per second: 387, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.131 [-1.254, 1.309], loss: 8.615246, mean_absolute_error: 41.562481, mean_q: 83.440155


 46657/50000: episode: 292, duration: 0.512s, episode steps: 200, steps per second: 390, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.237 [-1.298, 1.382], loss: 7.992279, mean_absolute_error: 41.353397, mean_q: 83.053780


 46857/50000: episode: 293, duration: 0.518s, episode steps: 200, steps per second: 386, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.216 [-1.319, 1.389], loss: 7.361880, mean_absolute_error: 41.307362, mean_q: 82.921898


 47057/50000: episode: 294, duration: 0.515s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.087 [-1.248, 1.444], loss: 10.491690, mean_absolute_error: 40.985474, mean_q: 82.329628


 47257/50000: episode: 295, duration: 0.516s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.143 [-1.325, 1.495], loss: 7.652686, mean_absolute_error: 40.757885, mean_q: 82.034607


 47457/50000: episode: 296, duration: 0.516s, episode steps: 200, steps per second: 387, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.375 [-1.305, 2.226], loss: 7.297657, mean_absolute_error: 40.872387, mean_q: 82.321808


 47657/50000: episode: 297, duration: 0.520s, episode steps: 200, steps per second: 385, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.332 [-0.954, 1.664], loss: 4.350832, mean_absolute_error: 40.563148, mean_q: 81.883133


 47857/50000: episode: 298, duration: 0.522s, episode steps: 200, steps per second: 383, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.418 [-1.768, 2.319], loss: 8.354321, mean_absolute_error: 40.731270, mean_q: 81.988075


 48052/50000: episode: 299, duration: 0.503s, episode steps: 195, steps per second: 388, episode reward: 195.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: 0.422 [-1.251, 2.409], loss: 8.538136, mean_absolute_error: 40.604889, mean_q: 81.751022


 48252/50000: episode: 300, duration: 0.514s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.089 [-1.662, 1.755], loss: 8.155535, mean_absolute_error: 40.312046, mean_q: 81.296341


 48452/50000: episode: 301, duration: 0.517s, episode steps: 200, steps per second: 387, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.384 [-1.485, 2.048], loss: 5.533853, mean_absolute_error: 40.543839, mean_q: 81.696312


 48636/50000: episode: 302, duration: 0.475s, episode steps: 184, steps per second: 387, episode reward: 184.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: 0.416 [-1.501, 2.410], loss: 6.354597, mean_absolute_error: 40.511803, mean_q: 81.447449


 48836/50000: episode: 303, duration: 0.527s, episode steps: 200, steps per second: 380, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.257 [-1.417, 1.554], loss: 8.217782, mean_absolute_error: 40.481800, mean_q: 81.308632


 49036/50000: episode: 304, duration: 0.515s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.391 [-1.159, 2.089], loss: 7.636243, mean_absolute_error: 40.700565, mean_q: 81.740608


 49236/50000: episode: 305, duration: 0.535s, episode steps: 200, steps per second: 374, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.248 [-1.122, 1.719], loss: 6.233873, mean_absolute_error: 39.647949, mean_q: 79.679665


 49436/50000: episode: 306, duration: 0.550s, episode steps: 200, steps per second: 364, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.364 [-1.697, 2.126], loss: 8.010751, mean_absolute_error: 40.165760, mean_q: 80.730644


 49636/50000: episode: 307, duration: 0.536s, episode steps: 200, steps per second: 373, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.137 [-1.848, 1.884], loss: 6.890379, mean_absolute_error: 40.184166, mean_q: 80.795280


 49832/50000: episode: 308, duration: 0.536s, episode steps: 196, steps per second: 366, episode reward: 196.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.408 [-1.639, 2.414], loss: 4.626545, mean_absolute_error: 40.276558, mean_q: 81.093483


done, took 135.205 seconds


<keras.callbacks.History at 0x231778c0eb8>

In [25]:
# Write the agent's network weights to disk, replacing any existing file of the same name.
weights_path = 'dqn_{}_weights.h5f'.format(ENV_NAME)
dqn.save_weights(weights_path, overwrite=True)

In [26]:
# Evaluate the agent for five rendered episodes.
# The viewer is closed in `finally` so the window is released even when the
# test run is interrupted by an exception (as in this cell's recorded output).
try:
    dqn.test(env, nb_episodes=5, visualize=True)
finally:
    env.render(close=True)

Testing for 5 episodes ...


Episode 1: reward: 200.000, steps: 200


Episode 2: reward: 200.000, steps: 200


AttributeError: 'NoneType' object has no attribute 'flip'

In [6]:
# Load previously saved weights for this environment into the agent.
weights_path = 'dqn_{}_weights.h5f'.format(ENV_NAME)
dqn.load_weights(weights_path)

In [98]:
# Begin a new episode; the initial observation is echoed as the cell result.
initial_observation = env.reset()
initial_observation

array([-0.00417311, -0.03291649,  0.04090877,  0.01345695])

In [79]:
# Take one randomly sampled action and print the resulting transition.
action = env.action_space.sample()
transition = env.step(action)
observation, reward, done, info = transition
print(action, *transition)

0 [ 0.03676252 -0.20705747  0.04376185  0.31327263] 1.0 False {}


In [97]:
# Drive the environment with up to 20 random actions, rendering each step.
# The loop stops as soon as the episode terminates (stepping a finished
# episode without a reset is undefined in Gym), and the viewer is always
# closed, even if rendering or stepping raises.
try:
    for i in range(20):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        env.render()
        print(action, observation, reward, done, info)
        if done:
            break
        time.sleep(1)  # slow the animation so each step is visible
finally:
    env.render(close=True)

1 [ 0.01944849  0.22771966  0.01550582 -0.32227389] 1.0 False {}


1 [ 0.02400288  0.42261741  0.00906034 -0.61002684] 1.0 False {}


0 [ 0.03245523  0.22736999 -0.00314019 -0.31450403] 1.0 False {}


0 [ 0.03700263  0.03229291 -0.00943027 -0.02281307] 1.0 False {}


0 [ 0.03764849 -0.16269254 -0.00988654  0.26687964] 1.0 False {}


0 [ 0.03439464 -0.357672   -0.00454894  0.55642795] 1.0 False {}


1 [ 0.0272412  -0.16248648  0.00657962  0.26231532] 1.0 False {}


0 [ 0.02399147 -0.35770173  0.01182592  0.55706625] 1.0 False {}


1 [ 0.01683744 -0.16274778  0.02296725  0.26813251] 1.0 False {}


0 [ 0.01358248 -0.35818985  0.0283299   0.56797007] 1.0 False {}


1 [ 0.00641868 -0.16347649  0.0396893   0.2843451 ] 1.0 False {}


1 [ 0.00314915  0.03105758  0.0453762   0.00443939] 1.0 False {}


0 [ 0.0037703  -0.16468477  0.04546499  0.31108672] 1.0 False {}


0 [  4.76608807e-04  -3.60423983e-01   5.16867232e-02   6.17754236e-01] 1.0 False {}


1 [-0.00673187 -0.1660607   0.06404181  0.34178779] 1.0 False {}


1 [-0.01005308  0.02809441  0.07087756  0.06996756] 1.0 False {}


0 [-0.0094912  -0.16796834  0.07227691  0.38414385] 1.0 False {}


1 [-0.01285056  0.02605703  0.07995979  0.11509658] 1.0 False {}


1 [-0.01232942  0.21994764  0.08226172 -0.15132696] 1.0 False {}


1 [-0.00793047  0.41380122  0.07923518 -0.41696635] 1.0 False {}


In [94]:
# Draw a single float from NumPy's global random state.
np.random.random_sample()

0.6700301041931712

In [103]:
# Render the current environment state ('human' is the default mode, spelled out here).
env.render(mode='human')

In [104]:
# Ask the environment to close its rendering window.
env.render(mode='human', close=True)

In [102]:
# Play one episode of CartPole-v0 (capped at 100 steps) with random actions,
# rendering every step and printing each observation and transition.
# NOTE: this rebinds the global `env` created at the top of the notebook.
env = gym.make('CartPole-v0')
try:
    for i_episode in range(1):
        observation = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print(action, reward, done)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Close the viewer even if rendering or stepping raises.
    env.render(close=True)

[2017-10-23 15:46:04,848] Making new env: CartPole-v0


[ 0.00782149  0.01473252 -0.00262509 -0.03426497]
1 1.0 False
[ 0.00811614  0.20989202 -0.00331039 -0.32777498]
1 1.0 False
[ 0.01231398  0.40506094 -0.00986589 -0.62150002]
0 1.0 False
[ 0.0204152   0.21007814 -0.02229589 -0.33194058]
1 1.0 False
[ 0.02461676  0.40551024 -0.0289347  -0.63157031]
0 1.0 False
[ 0.03272697  0.21080369 -0.04156611 -0.34813835]
1 1.0 False
[ 0.03694304  0.40649143 -0.04852887 -0.65363335]
1 1.0 False
[ 0.04507287  0.60225434 -0.06160154 -0.96119389]
0 1.0 False
[ 0.05711795  0.40801199 -0.08082542 -0.68848241]
0 1.0 False
[ 0.06527819  0.2140993  -0.09459507 -0.42229976]
1 1.0 False
[ 0.06956018  0.41042521 -0.10304106 -0.74324152]
1 1.0 False
[ 0.07776868  0.60680702 -0.11790589 -1.06649231]
1 1.0 False
[ 0.08990483  0.80327483 -0.13923574 -1.39373203]
1 

1.0 False
[ 0.10597032  0.99982788 -0.16711038 -1.72651152]
0 1.0 False
[ 0.12596688  0.80696488 -0.20164061 -1.49014913]
1 1.0 True
Episode finished after 15 timesteps


In [None]:
# Run the Q-network on CartPole-v1 for one episode (capped at 2000 steps).
# Every third action is sampled at random instead of taken from the network;
# all other actions are the network's greedy choice.
# NOTE: this rebinds the global `env` created at the top of the notebook.
env = gym.make('CartPole-v1')
try:
    for i_episode in range(1):
        observation = env.reset()
        # Model input is (batch, window_length) + observation shape, matching
        # the Flatten(input_shape=(1,) + env.observation_space.shape) layer.
        input_shape = (1, 1) + env.observation_space.shape
        for t in range(2000):
            env.render()
            if t % 3 == 0:
                action = env.action_space.sample()
            else:
                action = np.argmax(model.predict(observation.reshape(input_shape)))
            # Bind the fourth value to `info` rather than `_`, which IPython
            # uses for the last cell output.
            observation, reward, done, info = env.step(action)
            if done:
                # Stop here: stepping a terminated episode without a reset is
                # undefined in Gym.
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Close the viewer even if rendering, prediction or stepping raises.
    env.render(close=True)