# Andrew Whirisky - 17200679 and Neil Jones - 17202155

# Imports

In [2]:
import numpy as np
import pandas as pd
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Convolution2D
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
# Train a DQN agent (keras-rl) on a gym environment and save the learned weights.
ENV_NAME = 'LunarLander-v2'
SEED = 123              # shared by NumPy and the environment for repeatable runs
WINDOW_LENGTH = 1       # observations stacked per state; model input and memory must agree
HIDDEN_UNITS = 128      # width of every hidden Dense layer
NB_HIDDEN_LAYERS = 3    # number of Dense + ReLU pairs before the output layer


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)
nb_actions = env.action_space.n

# Next, we build a very simple model: the windowed observation is flattened, passed
# through identical Dense + ReLU layers, and mapped to one linear output per action.
model = Sequential()
model.add(Flatten(input_shape=(WINDOW_LENGTH,) + env.observation_space.shape))
for _ in range(NB_HIDDEN_LAYERS):
    model.add(Dense(HIDDEN_UNITS))
    model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so wrapping it in print()
# would only append a stray "None" line to the output.
model.summary()

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=500000, window_length=WINDOW_LENGTH)
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! Rendering is switched off (visualize=False)
# because it slows down training quite a lot. You can always safely abort the training
# prematurely using Ctrl + C.
dqn.fit(env, nb_steps=700000, visualize=False, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_9 (Dense)              (None, 128)               1152      
_________________________________________________________________
activation_9 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 128)               16512     
_________________________________________________________________
activation_10 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 128)               16512     
__________________________________



    141/700000: episode: 1, duration: 1.912s, episode steps: 141, steps per second: 74, episode reward: -466.933, mean reward: -3.312 [-100.000, 3.484], mean action: 1.709 [0.000, 3.000], mean observation: 0.117 [-1.496, 2.291], loss: 1.214043, mean_absolute_error: 1.790543, mean_q: -0.881507
    246/700000: episode: 2, duration: 0.628s, episode steps: 105, steps per second: 167, episode reward: -149.989, mean reward: -1.428 [-100.000, 2.863], mean action: 1.562 [0.000, 3.000], mean observation: 0.229 [-0.547, 1.083], loss: 23.277130, mean_absolute_error: 3.757147, mean_q: -2.688236
    409/700000: episode: 3, duration: 0.988s, episode steps: 163, steps per second: 165, episode reward: -82.222, mean reward: -0.504 [-100.000, 12.710], mean action: 1.957 [0.000, 3.000], mean observation: 0.020 [-0.807, 1.000], loss: 25.722189, mean_absolute_error: 4.110884, mean_q: -2.902601
    779/700000: episode: 4, duration: 1.970s, episode steps: 370, steps per second: 188, episode reward: -536.878,

  18264/700000: episode: 29, duration: 6.525s, episode steps: 1000, steps per second: 153, episode reward: -33.342, mean reward: -0.033 [-11.549, 13.776], mean action: 1.867 [0.000, 3.000], mean observation: 0.019 [-0.719, 1.000], loss: 4.353009, mean_absolute_error: 14.994679, mean_q: 18.186375
  19264/700000: episode: 30, duration: 6.135s, episode steps: 1000, steps per second: 163, episode reward: -43.251, mean reward: -0.043 [-19.353, 16.785], mean action: 1.804 [0.000, 3.000], mean observation: 0.044 [-0.740, 1.000], loss: 2.792253, mean_absolute_error: 15.184410, mean_q: 18.416075
  19493/700000: episode: 31, duration: 1.138s, episode steps: 229, steps per second: 201, episode reward: -60.403, mean reward: -0.264 [-100.000, 53.166], mean action: 1.345 [0.000, 3.000], mean observation: 0.138 [-0.970, 1.000], loss: 2.191468, mean_absolute_error: 15.405270, mean_q: 18.473156
  20493/700000: episode: 32, duration: 6.727s, episode steps: 1000, steps per second: 149, episode reward: -5

  29930/700000: episode: 57, duration: 0.672s, episode steps: 135, steps per second: 201, episode reward: -82.428, mean reward: -0.611 [-100.000, 61.747], mean action: 1.696 [0.000, 3.000], mean observation: 0.100 [-0.838, 2.220], loss: 28.370371, mean_absolute_error: 22.039919, mean_q: 28.479109
  30930/700000: episode: 58, duration: 6.254s, episode steps: 1000, steps per second: 160, episode reward: -44.875, mean reward: -0.045 [-21.686, 21.989], mean action: 1.582 [0.000, 3.000], mean observation: 0.117 [-0.380, 1.000], loss: 7.174170, mean_absolute_error: 21.986164, mean_q: 28.677841
  31930/700000: episode: 59, duration: 6.497s, episode steps: 1000, steps per second: 154, episode reward: -32.256, mean reward: -0.032 [-21.495, 15.899], mean action: 1.710 [0.000, 3.000], mean observation: 0.031 [-0.744, 1.000], loss: 7.869471, mean_absolute_error: 22.421822, mean_q: 29.293188
  32732/700000: episode: 60, duration: 4.361s, episode steps: 802, steps per second: 184, episode reward: 14

  47879/700000: episode: 85, duration: 2.356s, episode steps: 430, steps per second: 183, episode reward: 103.491, mean reward: 0.241 [-13.137, 100.000], mean action: 1.551 [0.000, 3.000], mean observation: 0.008 [-0.764, 1.000], loss: 5.830829, mean_absolute_error: 30.400738, mean_q: 40.380692
  48063/700000: episode: 86, duration: 0.910s, episode steps: 184, steps per second: 202, episode reward: -61.753, mean reward: -0.336 [-100.000, 12.998], mean action: 1.467 [0.000, 3.000], mean observation: 0.028 [-0.891, 1.000], loss: 7.501368, mean_absolute_error: 30.173916, mean_q: 40.005604
  48436/700000: episode: 87, duration: 1.954s, episode steps: 373, steps per second: 191, episode reward: 237.069, mean reward: 0.636 [-19.548, 100.000], mean action: 1.413 [0.000, 3.000], mean observation: 0.103 [-0.710, 1.000], loss: 8.899549, mean_absolute_error: 30.239313, mean_q: 40.085289
  48821/700000: episode: 88, duration: 2.096s, episode steps: 385, steps per second: 184, episode reward: 125.3

  57028/700000: episode: 113, duration: 0.534s, episode steps: 108, steps per second: 202, episode reward: -12.682, mean reward: -0.117 [-100.000, 14.362], mean action: 1.519 [0.000, 3.000], mean observation: -0.078 [-1.099, 1.048], loss: 8.043242, mean_absolute_error: 34.251884, mean_q: 45.809505
  57606/700000: episode: 114, duration: 3.076s, episode steps: 578, steps per second: 188, episode reward: 136.834, mean reward: 0.237 [-23.118, 100.000], mean action: 1.147 [0.000, 3.000], mean observation: 0.175 [-0.671, 1.000], loss: 18.500505, mean_absolute_error: 34.479733, mean_q: 46.412003
  57737/700000: episode: 115, duration: 0.651s, episode steps: 131, steps per second: 201, episode reward: -47.566, mean reward: -0.363 [-100.000, 13.698], mean action: 1.679 [0.000, 3.000], mean observation: 0.071 [-1.049, 1.008], loss: 7.158634, mean_absolute_error: 34.140228, mean_q: 45.660431
  57853/700000: episode: 116, duration: 0.566s, episode steps: 116, steps per second: 205, episode reward

  65384/700000: episode: 141, duration: 2.338s, episode steps: 398, steps per second: 170, episode reward: -256.049, mean reward: -0.643 [-100.000, 19.037], mean action: 1.789 [0.000, 3.000], mean observation: -0.053 [-1.548, 1.000], loss: 7.871044, mean_absolute_error: 39.281235, mean_q: 52.745094
  65901/700000: episode: 142, duration: 3.120s, episode steps: 517, steps per second: 166, episode reward: 213.922, mean reward: 0.414 [-17.375, 100.000], mean action: 1.308 [0.000, 3.000], mean observation: 0.112 [-0.771, 1.015], loss: 14.653708, mean_absolute_error: 39.103958, mean_q: 52.284832
  66414/700000: episode: 143, duration: 2.975s, episode steps: 513, steps per second: 172, episode reward: 92.562, mean reward: 0.180 [-17.843, 100.000], mean action: 1.750 [0.000, 3.000], mean observation: 0.015 [-0.892, 1.002], loss: 14.309364, mean_absolute_error: 39.376102, mean_q: 52.697056
  67414/700000: episode: 144, duration: 6.109s, episode steps: 1000, steps per second: 164, episode rewar

  77447/700000: episode: 169, duration: 0.902s, episode steps: 171, steps per second: 190, episode reward: -49.296, mean reward: -0.288 [-100.000, 15.879], mean action: 1.620 [0.000, 3.000], mean observation: -0.028 [-1.141, 1.000], loss: 14.976621, mean_absolute_error: 42.348072, mean_q: 56.527309
  77821/700000: episode: 170, duration: 2.109s, episode steps: 374, steps per second: 177, episode reward: 216.360, mean reward: 0.579 [-10.714, 100.000], mean action: 1.444 [0.000, 3.000], mean observation: 0.111 [-0.589, 1.000], loss: 7.942354, mean_absolute_error: 42.850498, mean_q: 57.476376
  77997/700000: episode: 171, duration: 0.873s, episode steps: 176, steps per second: 202, episode reward: -56.812, mean reward: -0.323 [-100.000, 9.736], mean action: 1.358 [0.000, 3.000], mean observation: 0.038 [-0.745, 1.000], loss: 14.248040, mean_absolute_error: 42.632629, mean_q: 57.106045
  78475/700000: episode: 172, duration: 2.530s, episode steps: 478, steps per second: 189, episode reward

  88836/700000: episode: 197, duration: 0.575s, episode steps: 115, steps per second: 200, episode reward: -82.686, mean reward: -0.719 [-100.000, 10.085], mean action: 1.635 [0.000, 3.000], mean observation: -0.075 [-0.794, 3.042], loss: 9.529130, mean_absolute_error: 44.650139, mean_q: 59.883198
  89270/700000: episode: 198, duration: 2.258s, episode steps: 434, steps per second: 192, episode reward: 228.050, mean reward: 0.525 [-11.476, 100.000], mean action: 1.187 [0.000, 3.000], mean observation: 0.134 [-0.617, 1.000], loss: 10.896939, mean_absolute_error: 44.655975, mean_q: 59.767757
  89782/700000: episode: 199, duration: 2.892s, episode steps: 512, steps per second: 177, episode reward: 177.516, mean reward: 0.347 [-18.342, 100.000], mean action: 1.658 [0.000, 3.000], mean observation: 0.090 [-1.041, 1.266], loss: 8.942348, mean_absolute_error: 45.020309, mean_q: 60.283909
  90782/700000: episode: 200, duration: 5.573s, episode steps: 1000, steps per second: 179, episode reward

 101057/700000: episode: 225, duration: 1.859s, episode steps: 360, steps per second: 194, episode reward: 228.583, mean reward: 0.635 [-20.841, 100.000], mean action: 1.517 [0.000, 3.000], mean observation: 0.053 [-0.808, 1.000], loss: 10.600630, mean_absolute_error: 46.216526, mean_q: 61.799984
 101171/700000: episode: 226, duration: 0.572s, episode steps: 114, steps per second: 199, episode reward: -47.904, mean reward: -0.420 [-100.000, 15.635], mean action: 1.798 [0.000, 3.000], mean observation: -0.029 [-0.784, 1.609], loss: 6.442275, mean_absolute_error: 46.399052, mean_q: 62.117992
 101449/700000: episode: 227, duration: 1.411s, episode steps: 278, steps per second: 197, episode reward: 189.169, mean reward: 0.680 [-17.902, 100.000], mean action: 1.209 [0.000, 3.000], mean observation: 0.062 [-1.025, 1.000], loss: 7.210200, mean_absolute_error: 46.505829, mean_q: 62.171341
 101759/700000: episode: 228, duration: 1.582s, episode steps: 310, steps per second: 196, episode reward:

 111594/700000: episode: 253, duration: 1.210s, episode steps: 242, steps per second: 200, episode reward: 236.174, mean reward: 0.976 [-2.768, 100.000], mean action: 1.322 [0.000, 3.000], mean observation: 0.096 [-0.761, 1.017], loss: 23.965893, mean_absolute_error: 45.349480, mean_q: 60.534996
 111699/700000: episode: 254, duration: 0.526s, episode steps: 105, steps per second: 200, episode reward: -73.376, mean reward: -0.699 [-100.000, 9.466], mean action: 1.486 [0.000, 3.000], mean observation: 0.026 [-1.209, 1.000], loss: 43.706772, mean_absolute_error: 45.252140, mean_q: 60.406254
 111824/700000: episode: 255, duration: 0.612s, episode steps: 125, steps per second: 204, episode reward: -361.334, mean reward: -2.891 [-100.000, 97.707], mean action: 1.192 [0.000, 3.000], mean observation: -0.156 [-3.443, 1.000], loss: 16.565248, mean_absolute_error: 45.091572, mean_q: 60.498219
 112120/700000: episode: 256, duration: 1.537s, episode steps: 296, steps per second: 193, episode rewar

 121807/700000: episode: 281, duration: 1.188s, episode steps: 233, steps per second: 196, episode reward: 252.348, mean reward: 1.083 [-10.780, 100.000], mean action: 1.275 [0.000, 3.000], mean observation: 0.108 [-1.265, 1.000], loss: 8.885400, mean_absolute_error: 47.866833, mean_q: 63.960720
 122477/700000: episode: 282, duration: 3.770s, episode steps: 670, steps per second: 178, episode reward: 187.886, mean reward: 0.280 [-17.955, 100.000], mean action: 1.169 [0.000, 3.000], mean observation: 0.126 [-0.711, 1.011], loss: 13.037686, mean_absolute_error: 47.850037, mean_q: 63.809292
 122817/700000: episode: 283, duration: 1.778s, episode steps: 340, steps per second: 191, episode reward: 215.473, mean reward: 0.634 [-17.761, 100.000], mean action: 1.059 [0.000, 3.000], mean observation: 0.137 [-0.708, 1.000], loss: 13.747090, mean_absolute_error: 47.836979, mean_q: 63.997162
 123108/700000: episode: 284, duration: 1.472s, episode steps: 291, steps per second: 198, episode reward: 

 133169/700000: episode: 309, duration: 0.910s, episode steps: 181, steps per second: 199, episode reward: -33.987, mean reward: -0.188 [-100.000, 17.041], mean action: 1.674 [0.000, 3.000], mean observation: 0.060 [-0.973, 1.000], loss: 14.069118, mean_absolute_error: 48.080952, mean_q: 63.753162
 133588/700000: episode: 310, duration: 2.233s, episode steps: 419, steps per second: 188, episode reward: -463.565, mean reward: -1.106 [-100.000, 5.086], mean action: 1.699 [0.000, 3.000], mean observation: 0.020 [-1.321, 1.505], loss: 13.661745, mean_absolute_error: 47.809322, mean_q: 63.600368
 133828/700000: episode: 311, duration: 1.219s, episode steps: 240, steps per second: 197, episode reward: 220.440, mean reward: 0.918 [-7.966, 100.000], mean action: 1.225 [0.000, 3.000], mean observation: 0.100 [-0.908, 1.000], loss: 11.370530, mean_absolute_error: 47.576214, mean_q: 63.517529
 134321/700000: episode: 312, duration: 2.751s, episode steps: 493, steps per second: 179, episode reward

 143237/700000: episode: 337, duration: 1.789s, episode steps: 184, steps per second: 103, episode reward: 222.141, mean reward: 1.207 [-17.658, 100.000], mean action: 1.337 [0.000, 3.000], mean observation: 0.069 [-1.115, 1.000], loss: 10.408929, mean_absolute_error: 50.818478, mean_q: 68.141083
 143372/700000: episode: 338, duration: 0.860s, episode steps: 135, steps per second: 157, episode reward: -22.194, mean reward: -0.164 [-100.000, 17.785], mean action: 1.489 [0.000, 3.000], mean observation: 0.079 [-0.953, 1.997], loss: 5.544368, mean_absolute_error: 50.830837, mean_q: 67.930672
 143594/700000: episode: 339, duration: 1.524s, episode steps: 222, steps per second: 146, episode reward: -24.819, mean reward: -0.112 [-100.000, 11.910], mean action: 1.721 [0.000, 3.000], mean observation: 0.037 [-0.949, 1.180], loss: 17.057508, mean_absolute_error: 50.299702, mean_q: 67.235558
 143777/700000: episode: 340, duration: 1.327s, episode steps: 183, steps per second: 138, episode reward

 154141/700000: episode: 365, duration: 1.050s, episode steps: 193, steps per second: 184, episode reward: 212.302, mean reward: 1.100 [-4.405, 100.000], mean action: 1.161 [0.000, 3.000], mean observation: 0.027 [-0.890, 1.000], loss: 17.944920, mean_absolute_error: 48.974701, mean_q: 65.398781
 154264/700000: episode: 366, duration: 0.915s, episode steps: 123, steps per second: 134, episode reward: -15.645, mean reward: -0.127 [-100.000, 16.470], mean action: 1.699 [0.000, 3.000], mean observation: 0.035 [-0.926, 1.000], loss: 27.124872, mean_absolute_error: 48.943577, mean_q: 65.232361
 154369/700000: episode: 367, duration: 0.525s, episode steps: 105, steps per second: 200, episode reward: -2.367, mean reward: -0.023 [-100.000, 22.428], mean action: 1.733 [0.000, 3.000], mean observation: 0.025 [-0.853, 1.428], loss: 8.112839, mean_absolute_error: 49.219025, mean_q: 65.837509
 154824/700000: episode: 368, duration: 2.523s, episode steps: 455, steps per second: 180, episode reward: 

 162970/700000: episode: 393, duration: 1.073s, episode steps: 169, steps per second: 158, episode reward: 10.891, mean reward: 0.064 [-100.000, 16.766], mean action: 1.710 [0.000, 3.000], mean observation: 0.077 [-0.681, 1.411], loss: 13.298999, mean_absolute_error: 49.650223, mean_q: 66.116196
 163203/700000: episode: 394, duration: 1.716s, episode steps: 233, steps per second: 136, episode reward: -405.242, mean reward: -1.739 [-100.000, 3.466], mean action: 1.378 [0.000, 3.000], mean observation: -0.063 [-1.511, 1.931], loss: 7.146236, mean_absolute_error: 49.297901, mean_q: 65.900536
 163600/700000: episode: 395, duration: 2.473s, episode steps: 397, steps per second: 161, episode reward: 202.835, mean reward: 0.511 [-17.625, 100.000], mean action: 1.005 [0.000, 3.000], mean observation: 0.182 [-0.902, 1.000], loss: 8.736560, mean_absolute_error: 49.541973, mean_q: 66.057556
 163719/700000: episode: 396, duration: 0.681s, episode steps: 119, steps per second: 175, episode reward: 

 173479/700000: episode: 421, duration: 0.749s, episode steps: 140, steps per second: 187, episode reward: -125.791, mean reward: -0.899 [-100.000, 10.829], mean action: 1.357 [0.000, 3.000], mean observation: -0.090 [-0.911, 1.352], loss: 8.958635, mean_absolute_error: 48.073704, mean_q: 64.405609
 173741/700000: episode: 422, duration: 1.343s, episode steps: 262, steps per second: 195, episode reward: 218.335, mean reward: 0.833 [-18.801, 100.000], mean action: 1.179 [0.000, 3.000], mean observation: 0.096 [-0.887, 1.000], loss: 7.690593, mean_absolute_error: 48.049931, mean_q: 64.153229
 174098/700000: episode: 423, duration: 2.196s, episode steps: 357, steps per second: 163, episode reward: 186.072, mean reward: 0.521 [-17.531, 100.000], mean action: 1.333 [0.000, 3.000], mean observation: 0.181 [-0.870, 1.000], loss: 10.326203, mean_absolute_error: 48.626938, mean_q: 64.676949
 174333/700000: episode: 424, duration: 1.170s, episode steps: 235, steps per second: 201, episode reward

 183615/700000: episode: 449, duration: 4.623s, episode steps: 809, steps per second: 175, episode reward: 169.281, mean reward: 0.209 [-22.492, 100.000], mean action: 1.252 [0.000, 3.000], mean observation: 0.190 [-0.790, 1.000], loss: 10.737769, mean_absolute_error: 47.056946, mean_q: 62.534271
 183915/700000: episode: 450, duration: 1.543s, episode steps: 300, steps per second: 194, episode reward: 231.175, mean reward: 0.771 [-20.280, 100.000], mean action: 1.300 [0.000, 3.000], mean observation: 0.128 [-0.885, 1.000], loss: 10.827659, mean_absolute_error: 47.208538, mean_q: 62.905468
 184301/700000: episode: 451, duration: 2.040s, episode steps: 386, steps per second: 189, episode reward: 174.585, mean reward: 0.452 [-17.228, 100.000], mean action: 1.360 [0.000, 3.000], mean observation: 0.148 [-0.835, 1.000], loss: 13.205199, mean_absolute_error: 47.214069, mean_q: 62.953091
 184586/700000: episode: 452, duration: 1.481s, episode steps: 285, steps per second: 192, episode reward:

 193435/700000: episode: 477, duration: 1.559s, episode steps: 301, steps per second: 193, episode reward: 206.395, mean reward: 0.686 [-17.375, 100.000], mean action: 1.123 [0.000, 3.000], mean observation: 0.135 [-0.663, 1.084], loss: 12.704515, mean_absolute_error: 47.624107, mean_q: 63.086533
 193852/700000: episode: 478, duration: 2.259s, episode steps: 417, steps per second: 185, episode reward: 191.944, mean reward: 0.460 [-17.438, 100.000], mean action: 1.295 [0.000, 3.000], mean observation: 0.089 [-1.474, 1.000], loss: 10.155291, mean_absolute_error: 47.617786, mean_q: 63.114388
 194104/700000: episode: 479, duration: 1.266s, episode steps: 252, steps per second: 199, episode reward: 192.307, mean reward: 0.763 [-19.402, 100.000], mean action: 0.964 [0.000, 3.000], mean observation: 0.079 [-0.809, 1.000], loss: 16.099916, mean_absolute_error: 47.470428, mean_q: 62.468037
 194238/700000: episode: 480, duration: 0.667s, episode steps: 134, steps per second: 201, episode reward:

 202830/700000: episode: 505, duration: 2.161s, episode steps: 411, steps per second: 190, episode reward: 226.031, mean reward: 0.550 [-19.143, 100.000], mean action: 1.399 [0.000, 3.000], mean observation: 0.117 [-0.790, 1.020], loss: 6.675127, mean_absolute_error: 48.404274, mean_q: 64.282364
 203442/700000: episode: 506, duration: 3.222s, episode steps: 612, steps per second: 190, episode reward: 223.361, mean reward: 0.365 [-17.432, 100.000], mean action: 1.291 [0.000, 3.000], mean observation: 0.127 [-0.983, 1.128], loss: 12.361053, mean_absolute_error: 48.199406, mean_q: 63.730583
 203785/700000: episode: 507, duration: 1.768s, episode steps: 343, steps per second: 194, episode reward: 234.721, mean reward: 0.684 [-20.815, 100.000], mean action: 0.988 [0.000, 3.000], mean observation: 0.105 [-0.794, 1.000], loss: 10.130342, mean_absolute_error: 47.989544, mean_q: 63.589279
 203982/700000: episode: 508, duration: 0.991s, episode steps: 197, steps per second: 199, episode reward: 

 213458/700000: episode: 533, duration: 2.344s, episode steps: 452, steps per second: 193, episode reward: 206.512, mean reward: 0.457 [-9.600, 100.000], mean action: 1.619 [0.000, 3.000], mean observation: 0.037 [-0.748, 1.093], loss: 10.793458, mean_absolute_error: 49.066166, mean_q: 65.254288
 213778/700000: episode: 534, duration: 1.683s, episode steps: 320, steps per second: 190, episode reward: 235.016, mean reward: 0.734 [-3.490, 100.000], mean action: 1.144 [0.000, 3.000], mean observation: 0.138 [-0.685, 1.000], loss: 8.520460, mean_absolute_error: 48.916466, mean_q: 65.290421
 214048/700000: episode: 535, duration: 1.380s, episode steps: 270, steps per second: 196, episode reward: 212.647, mean reward: 0.788 [-8.078, 100.000], mean action: 1.189 [0.000, 3.000], mean observation: 0.103 [-1.218, 1.000], loss: 11.021686, mean_absolute_error: 48.900059, mean_q: 64.663742
 214420/700000: episode: 536, duration: 1.972s, episode steps: 372, steps per second: 189, episode reward: 235

 222985/700000: episode: 561, duration: 3.156s, episode steps: 583, steps per second: 185, episode reward: 202.467, mean reward: 0.347 [-21.142, 100.000], mean action: 0.998 [0.000, 3.000], mean observation: 0.170 [-0.856, 1.000], loss: 7.186007, mean_absolute_error: 48.733330, mean_q: 64.805542
 223594/700000: episode: 562, duration: 3.363s, episode steps: 609, steps per second: 181, episode reward: 208.247, mean reward: 0.342 [-17.554, 100.000], mean action: 1.159 [0.000, 3.000], mean observation: 0.103 [-0.662, 1.000], loss: 9.578723, mean_absolute_error: 48.722069, mean_q: 64.411690
 223958/700000: episode: 563, duration: 1.838s, episode steps: 364, steps per second: 198, episode reward: -64.540, mean reward: -0.177 [-100.000, 18.680], mean action: 1.184 [0.000, 3.000], mean observation: 0.173 [-1.137, 1.040], loss: 12.484550, mean_absolute_error: 48.525986, mean_q: 64.462303
 224285/700000: episode: 564, duration: 1.690s, episode steps: 327, steps per second: 194, episode reward: 

 232707/700000: episode: 589, duration: 0.548s, episode steps: 107, steps per second: 195, episode reward: -90.706, mean reward: -0.848 [-100.000, 16.961], mean action: 1.654 [0.000, 3.000], mean observation: 0.036 [-1.753, 1.000], loss: 7.398994, mean_absolute_error: 47.992496, mean_q: 63.910412
 233337/700000: episode: 590, duration: 3.594s, episode steps: 630, steps per second: 175, episode reward: 235.346, mean reward: 0.374 [-19.192, 100.000], mean action: 0.894 [0.000, 3.000], mean observation: 0.175 [-1.019, 1.000], loss: 10.445923, mean_absolute_error: 48.177933, mean_q: 63.802479
 233568/700000: episode: 591, duration: 1.215s, episode steps: 231, steps per second: 190, episode reward: 246.895, mean reward: 1.069 [-11.204, 100.000], mean action: 1.446 [0.000, 3.000], mean observation: 0.136 [-0.757, 1.000], loss: 8.356945, mean_absolute_error: 48.281521, mean_q: 63.980934
 233934/700000: episode: 592, duration: 2.102s, episode steps: 366, steps per second: 174, episode reward: 

 241600/700000: episode: 617, duration: 1.822s, episode steps: 354, steps per second: 194, episode reward: 262.998, mean reward: 0.743 [-10.201, 100.000], mean action: 1.743 [0.000, 3.000], mean observation: 0.048 [-1.254, 1.004], loss: 11.533413, mean_absolute_error: 49.351299, mean_q: 65.932312
 241878/700000: episode: 618, duration: 1.405s, episode steps: 278, steps per second: 198, episode reward: 228.899, mean reward: 0.823 [-22.967, 100.000], mean action: 1.209 [0.000, 3.000], mean observation: 0.127 [-0.727, 1.000], loss: 12.447282, mean_absolute_error: 49.661476, mean_q: 66.359100
 242089/700000: episode: 619, duration: 1.059s, episode steps: 211, steps per second: 199, episode reward: 220.724, mean reward: 1.046 [-17.440, 100.000], mean action: 0.995 [0.000, 3.000], mean observation: 0.165 [-1.213, 1.000], loss: 8.782227, mean_absolute_error: 50.147751, mean_q: 66.978912
 242438/700000: episode: 620, duration: 1.786s, episode steps: 349, steps per second: 195, episode reward: 

 250575/700000: episode: 645, duration: 2.861s, episode steps: 553, steps per second: 193, episode reward: 190.146, mean reward: 0.344 [-18.066, 100.000], mean action: 0.915 [0.000, 3.000], mean observation: 0.198 [-0.909, 1.000], loss: 11.175149, mean_absolute_error: 51.238365, mean_q: 68.046089
 250984/700000: episode: 646, duration: 2.163s, episode steps: 409, steps per second: 189, episode reward: 224.186, mean reward: 0.548 [-18.856, 100.000], mean action: 1.071 [0.000, 3.000], mean observation: 0.137 [-0.852, 1.005], loss: 8.800639, mean_absolute_error: 51.311768, mean_q: 67.973839
 251439/700000: episode: 647, duration: 2.367s, episode steps: 455, steps per second: 192, episode reward: 195.843, mean reward: 0.430 [-17.780, 100.000], mean action: 0.635 [0.000, 3.000], mean observation: 0.164 [-0.889, 1.000], loss: 10.187439, mean_absolute_error: 51.369514, mean_q: 68.051239
 251793/700000: episode: 648, duration: 1.843s, episode steps: 354, steps per second: 192, episode reward: 

 262104/700000: episode: 673, duration: 3.800s, episode steps: 709, steps per second: 187, episode reward: 224.520, mean reward: 0.317 [-17.785, 100.000], mean action: 1.020 [0.000, 3.000], mean observation: 0.156 [-0.930, 1.206], loss: 8.569420, mean_absolute_error: 49.878571, mean_q: 66.432907
 262476/700000: episode: 674, duration: 1.921s, episode steps: 372, steps per second: 194, episode reward: 246.123, mean reward: 0.662 [-19.842, 100.000], mean action: 1.204 [0.000, 3.000], mean observation: 0.098 [-0.774, 1.000], loss: 8.155675, mean_absolute_error: 49.861778, mean_q: 66.296021
 262776/700000: episode: 675, duration: 1.541s, episode steps: 300, steps per second: 195, episode reward: -75.453, mean reward: -0.252 [-100.000, 10.697], mean action: 1.743 [0.000, 3.000], mean observation: -0.007 [-0.979, 3.300], loss: 9.936695, mean_absolute_error: 49.597092, mean_q: 65.857620
 262906/700000: episode: 676, duration: 0.662s, episode steps: 130, steps per second: 196, episode reward: 

 271720/700000: episode: 701, duration: 1.216s, episode steps: 240, steps per second: 197, episode reward: 199.291, mean reward: 0.830 [-17.444, 100.000], mean action: 1.050 [0.000, 3.000], mean observation: 0.084 [-0.766, 1.000], loss: 7.333984, mean_absolute_error: 49.699261, mean_q: 65.821381
 272282/700000: episode: 702, duration: 2.902s, episode steps: 562, steps per second: 194, episode reward: 220.091, mean reward: 0.392 [-19.707, 100.000], mean action: 1.062 [0.000, 3.000], mean observation: 0.198 [-0.796, 1.308], loss: 8.479438, mean_absolute_error: 49.907898, mean_q: 66.111771
 272811/700000: episode: 703, duration: 2.749s, episode steps: 529, steps per second: 192, episode reward: 209.918, mean reward: 0.397 [-17.539, 100.000], mean action: 0.915 [0.000, 3.000], mean observation: 0.150 [-1.172, 1.000], loss: 10.873364, mean_absolute_error: 49.908676, mean_q: 66.168861
 273426/700000: episode: 704, duration: 3.201s, episode steps: 615, steps per second: 192, episode reward: 2

 280922/700000: episode: 729, duration: 3.291s, episode steps: 623, steps per second: 189, episode reward: 227.807, mean reward: 0.366 [-20.171, 100.000], mean action: 1.289 [0.000, 3.000], mean observation: 0.042 [-0.808, 1.006], loss: 9.455032, mean_absolute_error: 50.168423, mean_q: 66.709961
 281082/700000: episode: 730, duration: 0.809s, episode steps: 160, steps per second: 198, episode reward: -30.043, mean reward: -0.188 [-100.000, 15.759], mean action: 1.494 [0.000, 3.000], mean observation: 0.025 [-0.933, 1.663], loss: 6.177604, mean_absolute_error: 49.973869, mean_q: 66.565437
 281443/700000: episode: 731, duration: 1.912s, episode steps: 361, steps per second: 189, episode reward: 217.948, mean reward: 0.604 [-3.295, 100.000], mean action: 1.632 [0.000, 3.000], mean observation: -0.010 [-0.921, 1.000], loss: 8.155828, mean_absolute_error: 50.466591, mean_q: 67.162941
 281674/700000: episode: 732, duration: 1.171s, episode steps: 231, steps per second: 197, episode reward: 2

 290178/700000: episode: 757, duration: 2.173s, episode steps: 407, steps per second: 187, episode reward: 234.484, mean reward: 0.576 [-16.923, 100.000], mean action: 1.263 [0.000, 3.000], mean observation: 0.184 [-0.871, 1.000], loss: 6.951761, mean_absolute_error: 49.974777, mean_q: 66.322723
 290388/700000: episode: 758, duration: 1.056s, episode steps: 210, steps per second: 199, episode reward: 212.433, mean reward: 1.012 [-9.833, 100.000], mean action: 1.371 [0.000, 3.000], mean observation: 0.071 [-0.671, 1.000], loss: 8.657633, mean_absolute_error: 50.168545, mean_q: 66.678253
 290613/700000: episode: 759, duration: 1.118s, episode steps: 225, steps per second: 201, episode reward: 219.707, mean reward: 0.976 [-3.074, 100.000], mean action: 0.956 [0.000, 3.000], mean observation: 0.102 [-0.823, 1.000], loss: 8.771860, mean_absolute_error: 50.400356, mean_q: 67.175339
 291045/700000: episode: 760, duration: 2.301s, episode steps: 432, steps per second: 188, episode reward: 239.

 299167/700000: episode: 785, duration: 1.968s, episode steps: 375, steps per second: 191, episode reward: 200.520, mean reward: 0.535 [-19.195, 100.000], mean action: 1.027 [0.000, 3.000], mean observation: 0.094 [-0.814, 1.000], loss: 13.713289, mean_absolute_error: 48.901302, mean_q: 64.936279
 299648/700000: episode: 786, duration: 2.535s, episode steps: 481, steps per second: 190, episode reward: 237.099, mean reward: 0.493 [-18.253, 100.000], mean action: 1.195 [0.000, 3.000], mean observation: 0.075 [-0.861, 1.000], loss: 7.381178, mean_absolute_error: 48.679119, mean_q: 64.786720
 300036/700000: episode: 787, duration: 2.018s, episode steps: 388, steps per second: 192, episode reward: 231.338, mean reward: 0.596 [-17.738, 100.000], mean action: 1.098 [0.000, 3.000], mean observation: 0.148 [-0.823, 1.000], loss: 5.760766, mean_absolute_error: 49.024410, mean_q: 65.132523
 300611/700000: episode: 788, duration: 2.981s, episode steps: 575, steps per second: 193, episode reward: 2

 310076/700000: episode: 813, duration: 1.557s, episode steps: 303, steps per second: 195, episode reward: 230.178, mean reward: 0.760 [-20.343, 100.000], mean action: 1.165 [0.000, 3.000], mean observation: 0.067 [-1.101, 1.000], loss: 9.716264, mean_absolute_error: 48.571899, mean_q: 64.726166
 310439/700000: episode: 814, duration: 2.027s, episode steps: 363, steps per second: 179, episode reward: 241.256, mean reward: 0.665 [-17.296, 100.000], mean action: 0.871 [0.000, 3.000], mean observation: 0.164 [-1.015, 1.000], loss: 10.537494, mean_absolute_error: 48.606281, mean_q: 65.022949
 310888/700000: episode: 815, duration: 2.358s, episode steps: 449, steps per second: 190, episode reward: 252.659, mean reward: 0.563 [-19.405, 100.000], mean action: 1.147 [0.000, 3.000], mean observation: 0.148 [-0.646, 1.268], loss: 7.016394, mean_absolute_error: 48.818336, mean_q: 64.991425
 311152/700000: episode: 816, duration: 1.339s, episode steps: 264, steps per second: 197, episode reward: 2

 320017/700000: episode: 841, duration: 1.632s, episode steps: 319, steps per second: 195, episode reward: 241.117, mean reward: 0.756 [-11.458, 100.000], mean action: 1.038 [0.000, 3.000], mean observation: 0.125 [-1.437, 1.000], loss: 10.206387, mean_absolute_error: 48.625488, mean_q: 64.451653
 320210/700000: episode: 842, duration: 0.984s, episode steps: 193, steps per second: 196, episode reward: 255.224, mean reward: 1.322 [-8.431, 100.000], mean action: 1.513 [0.000, 3.000], mean observation: 0.023 [-1.435, 1.000], loss: 9.118364, mean_absolute_error: 48.732327, mean_q: 64.858345
 320448/700000: episode: 843, duration: 1.208s, episode steps: 238, steps per second: 197, episode reward: 223.441, mean reward: 0.939 [-6.413, 100.000], mean action: 1.319 [0.000, 3.000], mean observation: 0.049 [-0.972, 1.000], loss: 7.310735, mean_absolute_error: 48.676144, mean_q: 64.556831
 321184/700000: episode: 844, duration: 3.802s, episode steps: 736, steps per second: 194, episode reward: 217

 329166/700000: episode: 869, duration: 1.055s, episode steps: 207, steps per second: 196, episode reward: 237.143, mean reward: 1.146 [-8.140, 100.000], mean action: 1.145 [0.000, 3.000], mean observation: 0.016 [-1.086, 1.366], loss: 7.415489, mean_absolute_error: 48.426056, mean_q: 64.095230
 329407/700000: episode: 870, duration: 1.219s, episode steps: 241, steps per second: 198, episode reward: 240.398, mean reward: 0.998 [-7.206, 100.000], mean action: 1.473 [0.000, 3.000], mean observation: 0.027 [-0.837, 1.000], loss: 7.390568, mean_absolute_error: 48.346375, mean_q: 64.165878
 329739/700000: episode: 871, duration: 1.698s, episode steps: 332, steps per second: 196, episode reward: 226.203, mean reward: 0.681 [-17.795, 100.000], mean action: 1.117 [0.000, 3.000], mean observation: 0.140 [-0.990, 1.000], loss: 7.091640, mean_absolute_error: 48.672291, mean_q: 64.688148
 329985/700000: episode: 872, duration: 1.244s, episode steps: 246, steps per second: 198, episode reward: 260.

 338024/700000: episode: 897, duration: 2.067s, episode steps: 399, steps per second: 193, episode reward: 195.192, mean reward: 0.489 [-14.091, 100.000], mean action: 0.937 [0.000, 3.000], mean observation: 0.110 [-0.960, 1.000], loss: 11.440192, mean_absolute_error: 48.331543, mean_q: 63.952724
 338786/700000: episode: 898, duration: 4.010s, episode steps: 762, steps per second: 190, episode reward: 234.358, mean reward: 0.308 [-20.494, 100.000], mean action: 0.591 [0.000, 3.000], mean observation: 0.211 [-0.940, 1.000], loss: 13.541261, mean_absolute_error: 48.461845, mean_q: 64.377945
 339786/700000: episode: 899, duration: 5.578s, episode steps: 1000, steps per second: 179, episode reward: 60.998, mean reward: 0.061 [-19.683, 22.367], mean action: 1.107 [0.000, 3.000], mean observation: 0.211 [-0.998, 1.000], loss: 8.763931, mean_absolute_error: 48.089725, mean_q: 63.954979
 340267/700000: episode: 900, duration: 2.436s, episode steps: 481, steps per second: 197, episode reward: 2

 348760/700000: episode: 925, duration: 1.909s, episode steps: 370, steps per second: 194, episode reward: 205.046, mean reward: 0.554 [-24.240, 100.000], mean action: 0.781 [0.000, 3.000], mean observation: 0.157 [-0.904, 1.000], loss: 11.217494, mean_absolute_error: 50.194088, mean_q: 66.417557
 349197/700000: episode: 926, duration: 2.333s, episode steps: 437, steps per second: 187, episode reward: 220.751, mean reward: 0.505 [-19.211, 100.000], mean action: 0.744 [0.000, 3.000], mean observation: 0.179 [-0.998, 1.000], loss: 9.160974, mean_absolute_error: 50.299030, mean_q: 66.694687
 349456/700000: episode: 927, duration: 1.307s, episode steps: 259, steps per second: 198, episode reward: 232.333, mean reward: 0.897 [-10.417, 100.000], mean action: 1.595 [0.000, 3.000], mean observation: 0.099 [-0.923, 1.000], loss: 7.386410, mean_absolute_error: 50.440975, mean_q: 67.050125
 349537/700000: episode: 928, duration: 0.408s, episode steps: 81, steps per second: 199, episode reward: -7

 356551/700000: episode: 953, duration: 1.250s, episode steps: 245, steps per second: 196, episode reward: 233.256, mean reward: 0.952 [-12.198, 100.000], mean action: 0.967 [0.000, 3.000], mean observation: 0.093 [-0.889, 1.030], loss: 7.049412, mean_absolute_error: 50.500408, mean_q: 66.997917
 356819/700000: episode: 954, duration: 1.394s, episode steps: 268, steps per second: 192, episode reward: 230.726, mean reward: 0.861 [-10.523, 100.000], mean action: 1.175 [0.000, 3.000], mean observation: 0.058 [-0.873, 1.000], loss: 9.667807, mean_absolute_error: 50.880554, mean_q: 67.365829
 356939/700000: episode: 955, duration: 0.605s, episode steps: 120, steps per second: 198, episode reward: -31.273, mean reward: -0.261 [-100.000, 10.139], mean action: 1.583 [0.000, 3.000], mean observation: 0.033 [-1.264, 1.000], loss: 3.332143, mean_absolute_error: 51.100529, mean_q: 67.741936
 357199/700000: episode: 956, duration: 1.304s, episode steps: 260, steps per second: 199, episode reward: 2

 364227/700000: episode: 981, duration: 1.274s, episode steps: 252, steps per second: 198, episode reward: 189.003, mean reward: 0.750 [-14.509, 100.000], mean action: 1.194 [0.000, 3.000], mean observation: 0.148 [-0.660, 1.114], loss: 11.203276, mean_absolute_error: 51.028629, mean_q: 67.592674
 364713/700000: episode: 982, duration: 2.566s, episode steps: 486, steps per second: 189, episode reward: 199.637, mean reward: 0.411 [-17.695, 100.000], mean action: 1.134 [0.000, 3.000], mean observation: 0.132 [-0.706, 1.000], loss: 8.551657, mean_absolute_error: 51.012749, mean_q: 67.977013
 365015/700000: episode: 983, duration: 1.573s, episode steps: 302, steps per second: 192, episode reward: 201.357, mean reward: 0.667 [-10.953, 100.000], mean action: 1.245 [0.000, 3.000], mean observation: 0.070 [-0.589, 1.000], loss: 8.655317, mean_absolute_error: 50.947617, mean_q: 67.464424
 365246/700000: episode: 984, duration: 1.184s, episode steps: 231, steps per second: 195, episode reward: 2

 372203/700000: episode: 1009, duration: 1.080s, episode steps: 210, steps per second: 194, episode reward: 183.917, mean reward: 0.876 [-8.060, 100.000], mean action: 1.467 [0.000, 3.000], mean observation: 0.163 [-1.050, 1.000], loss: 7.915995, mean_absolute_error: 50.139359, mean_q: 66.675522
 372476/700000: episode: 1010, duration: 1.395s, episode steps: 273, steps per second: 196, episode reward: 235.282, mean reward: 0.862 [-8.922, 100.000], mean action: 0.967 [0.000, 3.000], mean observation: 0.112 [-1.068, 1.009], loss: 8.304250, mean_absolute_error: 49.995129, mean_q: 66.511253
 372646/700000: episode: 1011, duration: 0.863s, episode steps: 170, steps per second: 197, episode reward: 237.999, mean reward: 1.400 [-9.388, 100.000], mean action: 1.147 [0.000, 3.000], mean observation: 0.015 [-1.082, 1.000], loss: 12.031823, mean_absolute_error: 49.974018, mean_q: 65.950600
 372843/700000: episode: 1012, duration: 1.017s, episode steps: 197, steps per second: 194, episode reward: 

 378753/700000: episode: 1037, duration: 1.463s, episode steps: 281, steps per second: 192, episode reward: 231.222, mean reward: 0.823 [-17.480, 100.000], mean action: 0.890 [0.000, 3.000], mean observation: 0.107 [-0.768, 1.000], loss: 10.075521, mean_absolute_error: 51.687511, mean_q: 68.310867
 379165/700000: episode: 1038, duration: 2.136s, episode steps: 412, steps per second: 193, episode reward: 219.482, mean reward: 0.533 [-18.375, 100.000], mean action: 0.995 [0.000, 3.000], mean observation: 0.103 [-0.887, 1.000], loss: 6.205854, mean_absolute_error: 51.118923, mean_q: 67.855858
 379647/700000: episode: 1039, duration: 2.515s, episode steps: 482, steps per second: 192, episode reward: 228.961, mean reward: 0.475 [-20.233, 100.000], mean action: 0.612 [0.000, 3.000], mean observation: 0.182 [-0.801, 1.000], loss: 10.308460, mean_absolute_error: 50.629124, mean_q: 67.307671
 380126/700000: episode: 1040, duration: 2.752s, episode steps: 479, steps per second: 174, episode rewa

 388481/700000: episode: 1065, duration: 1.688s, episode steps: 329, steps per second: 195, episode reward: 218.194, mean reward: 0.663 [-20.419, 100.000], mean action: 0.827 [0.000, 3.000], mean observation: 0.165 [-0.927, 1.000], loss: 7.661958, mean_absolute_error: 50.362392, mean_q: 66.952904
 388763/700000: episode: 1066, duration: 1.448s, episode steps: 282, steps per second: 195, episode reward: 255.186, mean reward: 0.905 [-8.427, 100.000], mean action: 1.128 [0.000, 3.000], mean observation: 0.096 [-0.889, 1.015], loss: 8.362241, mean_absolute_error: 49.894592, mean_q: 66.394882
 388950/700000: episode: 1067, duration: 0.944s, episode steps: 187, steps per second: 198, episode reward: 234.553, mean reward: 1.254 [-9.497, 100.000], mean action: 1.369 [0.000, 3.000], mean observation: 0.091 [-0.794, 1.000], loss: 4.384242, mean_absolute_error: 49.994568, mean_q: 66.239990
 389262/700000: episode: 1068, duration: 1.644s, episode steps: 312, steps per second: 190, episode reward: 

 397910/700000: episode: 1093, duration: 1.212s, episode steps: 243, steps per second: 200, episode reward: 230.415, mean reward: 0.948 [-10.843, 100.000], mean action: 0.914 [0.000, 3.000], mean observation: 0.088 [-1.055, 1.000], loss: 6.923079, mean_absolute_error: 49.971489, mean_q: 66.519676
 398180/700000: episode: 1094, duration: 1.441s, episode steps: 270, steps per second: 187, episode reward: 211.337, mean reward: 0.783 [-9.956, 100.000], mean action: 1.285 [0.000, 3.000], mean observation: 0.118 [-0.907, 1.000], loss: 6.446816, mean_absolute_error: 50.410225, mean_q: 66.807770
 398459/700000: episode: 1095, duration: 1.419s, episode steps: 279, steps per second: 197, episode reward: 248.524, mean reward: 0.891 [-10.334, 100.000], mean action: 1.434 [0.000, 3.000], mean observation: 0.046 [-0.780, 1.000], loss: 6.297441, mean_absolute_error: 49.825127, mean_q: 66.386253
 398672/700000: episode: 1096, duration: 1.063s, episode steps: 213, steps per second: 200, episode reward:

 408319/700000: episode: 1121, duration: 2.648s, episode steps: 485, steps per second: 183, episode reward: 233.895, mean reward: 0.482 [-22.204, 100.000], mean action: 1.390 [0.000, 3.000], mean observation: 0.119 [-0.863, 1.008], loss: 11.807820, mean_absolute_error: 50.523674, mean_q: 67.421394
 408846/700000: episode: 1122, duration: 2.709s, episode steps: 527, steps per second: 195, episode reward: 232.585, mean reward: 0.441 [-20.302, 100.000], mean action: 0.554 [0.000, 3.000], mean observation: 0.199 [-0.961, 1.000], loss: 6.761757, mean_absolute_error: 50.888382, mean_q: 67.939445
 409037/700000: episode: 1123, duration: 0.951s, episode steps: 191, steps per second: 201, episode reward: 214.282, mean reward: 1.122 [-7.919, 100.000], mean action: 1.058 [0.000, 3.000], mean observation: 0.063 [-0.951, 1.000], loss: 6.185714, mean_absolute_error: 51.453125, mean_q: 68.592537
 409417/700000: episode: 1124, duration: 1.972s, episode steps: 380, steps per second: 193, episode reward

 417886/700000: episode: 1149, duration: 1.149s, episode steps: 229, steps per second: 199, episode reward: 212.806, mean reward: 0.929 [-8.127, 100.000], mean action: 1.293 [0.000, 3.000], mean observation: 0.112 [-1.102, 1.000], loss: 8.062881, mean_absolute_error: 51.609703, mean_q: 68.496788
 418106/700000: episode: 1150, duration: 1.112s, episode steps: 220, steps per second: 198, episode reward: 203.727, mean reward: 0.926 [-17.600, 100.000], mean action: 1.245 [0.000, 3.000], mean observation: 0.112 [-0.866, 1.093], loss: 9.357922, mean_absolute_error: 51.419376, mean_q: 68.504753
 418323/700000: episode: 1151, duration: 1.088s, episode steps: 217, steps per second: 199, episode reward: 255.712, mean reward: 1.178 [-7.663, 100.000], mean action: 1.346 [0.000, 3.000], mean observation: 0.124 [-0.652, 1.000], loss: 4.958746, mean_absolute_error: 51.425880, mean_q: 68.488190
 418589/700000: episode: 1152, duration: 1.347s, episode steps: 266, steps per second: 197, episode reward: 

 426607/700000: episode: 1177, duration: 1.342s, episode steps: 267, steps per second: 199, episode reward: 225.581, mean reward: 0.845 [-3.780, 100.000], mean action: 0.914 [0.000, 3.000], mean observation: 0.150 [-0.950, 1.000], loss: 10.733925, mean_absolute_error: 52.156048, mean_q: 69.288620
 427038/700000: episode: 1178, duration: 2.339s, episode steps: 431, steps per second: 184, episode reward: 217.643, mean reward: 0.505 [-19.332, 100.000], mean action: 1.183 [0.000, 3.000], mean observation: 0.134 [-0.959, 1.036], loss: 7.844600, mean_absolute_error: 52.153339, mean_q: 69.292870
 427429/700000: episode: 1179, duration: 2.037s, episode steps: 391, steps per second: 192, episode reward: 222.395, mean reward: 0.569 [-17.443, 100.000], mean action: 1.074 [0.000, 3.000], mean observation: 0.114 [-1.070, 1.000], loss: 9.254855, mean_absolute_error: 51.977402, mean_q: 68.796379
 428000/700000: episode: 1180, duration: 3.301s, episode steps: 571, steps per second: 173, episode reward

 436985/700000: episode: 1205, duration: 1.443s, episode steps: 281, steps per second: 195, episode reward: 205.801, mean reward: 0.732 [-17.547, 100.000], mean action: 1.836 [0.000, 3.000], mean observation: 0.102 [-1.110, 1.000], loss: 11.272510, mean_absolute_error: 51.247066, mean_q: 68.254341
 437161/700000: episode: 1206, duration: 0.959s, episode steps: 176, steps per second: 184, episode reward: -97.104, mean reward: -0.552 [-100.000, 10.936], mean action: 1.773 [0.000, 3.000], mean observation: -0.091 [-1.353, 1.000], loss: 8.294566, mean_absolute_error: 51.025028, mean_q: 68.050850
 437521/700000: episode: 1207, duration: 2.095s, episode steps: 360, steps per second: 172, episode reward: 244.149, mean reward: 0.678 [-17.340, 100.000], mean action: 1.278 [0.000, 3.000], mean observation: 0.053 [-0.780, 1.211], loss: 7.900784, mean_absolute_error: 50.928856, mean_q: 67.543861
 437914/700000: episode: 1208, duration: 2.167s, episode steps: 393, steps per second: 181, episode rew

 444967/700000: episode: 1233, duration: 1.844s, episode steps: 347, steps per second: 188, episode reward: 196.757, mean reward: 0.567 [-21.414, 100.000], mean action: 0.931 [0.000, 3.000], mean observation: 0.139 [-0.903, 1.000], loss: 7.883616, mean_absolute_error: 50.716801, mean_q: 67.454102
 445086/700000: episode: 1234, duration: 0.605s, episode steps: 119, steps per second: 197, episode reward: 12.846, mean reward: 0.108 [-100.000, 16.783], mean action: 1.639 [0.000, 3.000], mean observation: -0.050 [-0.891, 1.000], loss: 9.661957, mean_absolute_error: 50.181866, mean_q: 66.979057
 445256/700000: episode: 1235, duration: 0.849s, episode steps: 170, steps per second: 200, episode reward: 233.781, mean reward: 1.375 [-17.889, 100.000], mean action: 1.276 [0.000, 3.000], mean observation: 0.047 [-0.963, 1.000], loss: 6.152945, mean_absolute_error: 50.729858, mean_q: 67.729202
 445477/700000: episode: 1236, duration: 1.111s, episode steps: 221, steps per second: 199, episode reward

 452327/700000: episode: 1261, duration: 1.518s, episode steps: 294, steps per second: 194, episode reward: 224.410, mean reward: 0.763 [-19.237, 100.000], mean action: 0.993 [0.000, 3.000], mean observation: 0.110 [-0.800, 1.000], loss: 34.205502, mean_absolute_error: 50.412724, mean_q: 67.066864
 452498/700000: episode: 1262, duration: 0.851s, episode steps: 171, steps per second: 201, episode reward: -2.023, mean reward: -0.012 [-100.000, 15.086], mean action: 1.544 [0.000, 3.000], mean observation: 0.018 [-1.062, 1.122], loss: 10.668517, mean_absolute_error: 50.299767, mean_q: 67.272263
 452703/700000: episode: 1263, duration: 1.022s, episode steps: 205, steps per second: 201, episode reward: 235.276, mean reward: 1.148 [-19.099, 100.000], mean action: 1.122 [0.000, 3.000], mean observation: 0.123 [-0.869, 1.000], loss: 5.770717, mean_absolute_error: 50.443741, mean_q: 67.395355
 452876/700000: episode: 1264, duration: 0.868s, episode steps: 173, steps per second: 199, episode rewa

 461596/700000: episode: 1289, duration: 2.248s, episode steps: 384, steps per second: 171, episode reward: 236.979, mean reward: 0.617 [-9.982, 100.000], mean action: 1.357 [0.000, 3.000], mean observation: 0.006 [-0.934, 1.000], loss: 7.621586, mean_absolute_error: 51.947063, mean_q: 69.468376
 461789/700000: episode: 1290, duration: 1.006s, episode steps: 193, steps per second: 192, episode reward: 228.653, mean reward: 1.185 [-9.119, 100.000], mean action: 1.171 [0.000, 3.000], mean observation: 0.110 [-1.005, 1.000], loss: 9.742319, mean_absolute_error: 52.066113, mean_q: 69.692612
 462243/700000: episode: 1291, duration: 2.536s, episode steps: 454, steps per second: 179, episode reward: 212.554, mean reward: 0.468 [-18.839, 100.000], mean action: 0.943 [0.000, 3.000], mean observation: 0.190 [-1.004, 1.000], loss: 6.912411, mean_absolute_error: 51.864048, mean_q: 69.065025
 462508/700000: episode: 1292, duration: 1.351s, episode steps: 265, steps per second: 196, episode reward: 

 471116/700000: episode: 1317, duration: 1.555s, episode steps: 305, steps per second: 196, episode reward: 207.159, mean reward: 0.679 [-7.915, 100.000], mean action: 0.964 [0.000, 3.000], mean observation: 0.117 [-0.904, 1.000], loss: 10.270650, mean_absolute_error: 50.255768, mean_q: 67.006752
 471203/700000: episode: 1318, duration: 0.438s, episode steps: 87, steps per second: 199, episode reward: -14.413, mean reward: -0.166 [-100.000, 21.764], mean action: 1.552 [0.000, 3.000], mean observation: 0.015 [-1.113, 1.000], loss: 7.209741, mean_absolute_error: 50.497074, mean_q: 67.203667
 471355/700000: episode: 1319, duration: 0.761s, episode steps: 152, steps per second: 200, episode reward: 241.947, mean reward: 1.592 [-9.917, 100.000], mean action: 1.296 [0.000, 3.000], mean observation: 0.057 [-1.133, 1.000], loss: 16.330217, mean_absolute_error: 50.348709, mean_q: 67.185661
 471466/700000: episode: 1320, duration: 0.558s, episode steps: 111, steps per second: 199, episode reward

 477685/700000: episode: 1345, duration: 1.214s, episode steps: 238, steps per second: 196, episode reward: 252.560, mean reward: 1.061 [-9.982, 100.000], mean action: 1.139 [0.000, 3.000], mean observation: 0.109 [-0.940, 1.455], loss: 10.291264, mean_absolute_error: 51.569916, mean_q: 68.862282
 477887/700000: episode: 1346, duration: 1.012s, episode steps: 202, steps per second: 200, episode reward: 224.873, mean reward: 1.113 [-2.888, 100.000], mean action: 0.975 [0.000, 3.000], mean observation: 0.121 [-0.971, 1.000], loss: 9.644251, mean_absolute_error: 51.409565, mean_q: 68.512230
 478183/700000: episode: 1347, duration: 1.485s, episode steps: 296, steps per second: 199, episode reward: 244.260, mean reward: 0.825 [-19.890, 100.000], mean action: 0.966 [0.000, 3.000], mean observation: 0.101 [-0.935, 1.000], loss: 6.916751, mean_absolute_error: 51.447666, mean_q: 68.606895
 478637/700000: episode: 1348, duration: 2.378s, episode steps: 454, steps per second: 191, episode reward:

 486642/700000: episode: 1373, duration: 6.495s, episode steps: 1000, steps per second: 154, episode reward: -50.488, mean reward: -0.050 [-5.235, 6.301], mean action: 1.765 [0.000, 3.000], mean observation: -0.027 [-0.811, 0.939], loss: 10.052736, mean_absolute_error: 51.234318, mean_q: 68.169327
 486919/700000: episode: 1374, duration: 1.431s, episode steps: 277, steps per second: 194, episode reward: 216.671, mean reward: 0.782 [-9.500, 100.000], mean action: 1.014 [0.000, 3.000], mean observation: 0.115 [-1.086, 1.000], loss: 8.724457, mean_absolute_error: 50.713596, mean_q: 67.443375
 487919/700000: episode: 1375, duration: 5.702s, episode steps: 1000, steps per second: 175, episode reward: 38.542, mean reward: 0.039 [-19.824, 22.875], mean action: 1.536 [0.000, 3.000], mean observation: 0.134 [-0.828, 1.000], loss: 8.283174, mean_absolute_error: 50.714256, mean_q: 67.701080
 488919/700000: episode: 1376, duration: 5.858s, episode steps: 1000, steps per second: 171, episode reward

 496579/700000: episode: 1401, duration: 0.977s, episode steps: 195, steps per second: 200, episode reward: 242.190, mean reward: 1.242 [-9.423, 100.000], mean action: 1.072 [0.000, 3.000], mean observation: 0.090 [-0.995, 1.060], loss: 10.138759, mean_absolute_error: 51.577122, mean_q: 68.847404
 496808/700000: episode: 1402, duration: 1.153s, episode steps: 229, steps per second: 199, episode reward: 223.093, mean reward: 0.974 [-8.951, 100.000], mean action: 1.319 [0.000, 3.000], mean observation: 0.104 [-0.825, 1.461], loss: 9.178043, mean_absolute_error: 51.231293, mean_q: 68.575317
 497180/700000: episode: 1403, duration: 1.883s, episode steps: 372, steps per second: 198, episode reward: 232.794, mean reward: 0.626 [-22.999, 100.000], mean action: 0.723 [0.000, 3.000], mean observation: 0.170 [-1.094, 1.236], loss: 13.641322, mean_absolute_error: 51.344742, mean_q: 68.654610
 497393/700000: episode: 1404, duration: 1.072s, episode steps: 213, steps per second: 199, episode reward

 503352/700000: episode: 1429, duration: 1.209s, episode steps: 241, steps per second: 199, episode reward: 247.774, mean reward: 1.028 [-6.238, 100.000], mean action: 1.100 [0.000, 3.000], mean observation: 0.029 [-0.752, 1.000], loss: 19.258873, mean_absolute_error: 51.021412, mean_q: 68.149498
 503595/700000: episode: 1430, duration: 1.235s, episode steps: 243, steps per second: 197, episode reward: 235.279, mean reward: 0.968 [-11.134, 100.000], mean action: 0.996 [0.000, 3.000], mean observation: 0.063 [-0.777, 1.158], loss: 10.038820, mean_absolute_error: 51.592415, mean_q: 69.048164
 503854/700000: episode: 1431, duration: 1.315s, episode steps: 259, steps per second: 197, episode reward: 195.601, mean reward: 0.755 [-19.874, 100.000], mean action: 1.216 [0.000, 3.000], mean observation: 0.136 [-0.934, 1.000], loss: 24.822142, mean_absolute_error: 51.536869, mean_q: 68.998344
 503969/700000: episode: 1432, duration: 0.575s, episode steps: 115, steps per second: 200, episode rewa

 511608/700000: episode: 1457, duration: 1.600s, episode steps: 312, steps per second: 195, episode reward: 257.429, mean reward: 0.825 [-11.894, 100.000], mean action: 1.250 [0.000, 3.000], mean observation: 0.086 [-1.551, 1.072], loss: 10.160022, mean_absolute_error: 51.470345, mean_q: 68.877502
 511879/700000: episode: 1458, duration: 1.354s, episode steps: 271, steps per second: 200, episode reward: 265.170, mean reward: 0.978 [-12.384, 100.000], mean action: 0.985 [0.000, 3.000], mean observation: 0.141 [-0.923, 1.023], loss: 12.360262, mean_absolute_error: 51.471771, mean_q: 68.831558
 512278/700000: episode: 1459, duration: 2.020s, episode steps: 399, steps per second: 198, episode reward: 250.772, mean reward: 0.629 [-9.041, 100.000], mean action: 0.980 [0.000, 3.000], mean observation: 0.111 [-0.935, 1.000], loss: 7.245112, mean_absolute_error: 51.335678, mean_q: 68.734764
 512360/700000: episode: 1460, duration: 0.421s, episode steps: 82, steps per second: 195, episode reward

 519728/700000: episode: 1485, duration: 1.118s, episode steps: 220, steps per second: 197, episode reward: 216.194, mean reward: 0.983 [-18.702, 100.000], mean action: 0.955 [0.000, 3.000], mean observation: 0.101 [-0.931, 1.000], loss: 8.349031, mean_absolute_error: 50.765240, mean_q: 67.815178
 520087/700000: episode: 1486, duration: 1.894s, episode steps: 359, steps per second: 190, episode reward: 207.764, mean reward: 0.579 [-19.459, 100.000], mean action: 2.245 [0.000, 3.000], mean observation: 0.197 [-0.839, 1.000], loss: 13.324205, mean_absolute_error: 50.930130, mean_q: 67.941650
 520361/700000: episode: 1487, duration: 1.405s, episode steps: 274, steps per second: 195, episode reward: 236.348, mean reward: 0.863 [-6.327, 100.000], mean action: 1.785 [0.000, 3.000], mean observation: 0.206 [-0.875, 1.378], loss: 11.107065, mean_absolute_error: 51.047260, mean_q: 68.299232
 520661/700000: episode: 1488, duration: 1.510s, episode steps: 300, steps per second: 199, episode rewar

 527921/700000: episode: 1513, duration: 2.264s, episode steps: 430, steps per second: 190, episode reward: 230.104, mean reward: 0.535 [-24.340, 100.000], mean action: 1.021 [0.000, 3.000], mean observation: 0.114 [-1.026, 1.000], loss: 8.473076, mean_absolute_error: 53.017689, mean_q: 70.654800
 528244/700000: episode: 1514, duration: 1.689s, episode steps: 323, steps per second: 191, episode reward: 245.165, mean reward: 0.759 [-8.739, 100.000], mean action: 1.077 [0.000, 3.000], mean observation: 0.125 [-0.878, 1.000], loss: 12.692527, mean_absolute_error: 52.457272, mean_q: 69.836014
 528561/700000: episode: 1515, duration: 1.606s, episode steps: 317, steps per second: 197, episode reward: 209.894, mean reward: 0.662 [-18.478, 100.000], mean action: 0.801 [0.000, 3.000], mean observation: 0.147 [-1.020, 1.000], loss: 9.813458, mean_absolute_error: 52.606258, mean_q: 70.287903
 528894/700000: episode: 1516, duration: 1.670s, episode steps: 333, steps per second: 199, episode reward

 537088/700000: episode: 1541, duration: 1.843s, episode steps: 354, steps per second: 192, episode reward: 228.274, mean reward: 0.645 [-11.147, 100.000], mean action: 1.172 [0.000, 3.000], mean observation: 0.116 [-0.914, 1.000], loss: 9.348117, mean_absolute_error: 52.076778, mean_q: 69.720055
 537381/700000: episode: 1542, duration: 1.490s, episode steps: 293, steps per second: 197, episode reward: 222.733, mean reward: 0.760 [-17.607, 100.000], mean action: 1.171 [0.000, 3.000], mean observation: 0.121 [-0.916, 1.000], loss: 9.169442, mean_absolute_error: 52.560524, mean_q: 70.275520
 537694/700000: episode: 1543, duration: 1.592s, episode steps: 313, steps per second: 197, episode reward: 258.446, mean reward: 0.826 [-4.898, 100.000], mean action: 1.188 [0.000, 3.000], mean observation: 0.088 [-0.759, 1.000], loss: 8.374727, mean_absolute_error: 52.628384, mean_q: 70.098457
 538140/700000: episode: 1544, duration: 2.399s, episode steps: 446, steps per second: 186, episode reward:

 544498/700000: episode: 1569, duration: 1.439s, episode steps: 283, steps per second: 197, episode reward: 246.352, mean reward: 0.871 [-21.618, 100.000], mean action: 1.035 [0.000, 3.000], mean observation: 0.127 [-0.841, 1.000], loss: 7.922506, mean_absolute_error: 52.659672, mean_q: 70.563217
 544727/700000: episode: 1570, duration: 1.161s, episode steps: 229, steps per second: 197, episode reward: 197.798, mean reward: 0.864 [-20.065, 100.000], mean action: 1.279 [0.000, 3.000], mean observation: 0.091 [-0.953, 1.000], loss: 8.566310, mean_absolute_error: 52.848400, mean_q: 70.828674
 545014/700000: episode: 1571, duration: 1.471s, episode steps: 287, steps per second: 195, episode reward: 242.943, mean reward: 0.846 [-4.202, 100.000], mean action: 1.098 [0.000, 3.000], mean observation: 0.162 [-0.891, 1.092], loss: 4.787218, mean_absolute_error: 52.473881, mean_q: 70.252968
 546014/700000: episode: 1572, duration: 5.895s, episode steps: 1000, steps per second: 170, episode reward

 553320/700000: episode: 1597, duration: 2.185s, episode steps: 424, steps per second: 194, episode reward: 249.165, mean reward: 0.588 [-19.674, 100.000], mean action: 1.651 [0.000, 3.000], mean observation: 0.132 [-1.033, 1.017], loss: 7.124447, mean_absolute_error: 52.166393, mean_q: 69.984344
 553471/700000: episode: 1598, duration: 0.759s, episode steps: 151, steps per second: 199, episode reward: 50.491, mean reward: 0.334 [-100.000, 15.146], mean action: 1.682 [0.000, 3.000], mean observation: -0.026 [-0.813, 1.000], loss: 8.109415, mean_absolute_error: 52.738121, mean_q: 70.756813
 553639/700000: episode: 1599, duration: 0.846s, episode steps: 168, steps per second: 199, episode reward: 201.473, mean reward: 1.199 [-9.449, 100.000], mean action: 1.554 [0.000, 3.000], mean observation: 0.072 [-0.992, 1.000], loss: 10.135277, mean_absolute_error: 53.070511, mean_q: 71.175804
 553911/700000: episode: 1600, duration: 1.401s, episode steps: 272, steps per second: 194, episode reward

 560802/700000: episode: 1625, duration: 0.959s, episode steps: 190, steps per second: 198, episode reward: 227.174, mean reward: 1.196 [-3.782, 100.000], mean action: 1.253 [0.000, 3.000], mean observation: 0.145 [-0.804, 1.000], loss: 6.158496, mean_absolute_error: 52.725620, mean_q: 70.756569
 560930/700000: episode: 1626, duration: 0.647s, episode steps: 128, steps per second: 198, episode reward: -7.186, mean reward: -0.056 [-100.000, 29.509], mean action: 1.391 [0.000, 3.000], mean observation: -0.021 [-0.856, 1.000], loss: 13.717489, mean_absolute_error: 52.589653, mean_q: 70.651260
 561307/700000: episode: 1627, duration: 1.970s, episode steps: 377, steps per second: 191, episode reward: 193.239, mean reward: 0.513 [-17.723, 100.000], mean action: 1.220 [0.000, 3.000], mean observation: 0.103 [-0.581, 1.000], loss: 8.679715, mean_absolute_error: 52.948746, mean_q: 70.669991
 561677/700000: episode: 1628, duration: 1.937s, episode steps: 370, steps per second: 191, episode rewar

 567055/700000: episode: 1653, duration: 0.666s, episode steps: 129, steps per second: 194, episode reward: 17.452, mean reward: 0.135 [-100.000, 15.770], mean action: 1.690 [0.000, 3.000], mean observation: 0.059 [-0.993, 1.000], loss: 15.289721, mean_absolute_error: 52.098419, mean_q: 69.435440
 567304/700000: episode: 1654, duration: 1.257s, episode steps: 249, steps per second: 198, episode reward: 246.521, mean reward: 0.990 [-10.351, 100.000], mean action: 1.020 [0.000, 3.000], mean observation: 0.078 [-1.281, 1.000], loss: 12.792688, mean_absolute_error: 51.249249, mean_q: 68.623886
 567487/700000: episode: 1655, duration: 0.940s, episode steps: 183, steps per second: 195, episode reward: -15.179, mean reward: -0.083 [-100.000, 21.135], mean action: 1.727 [0.000, 3.000], mean observation: 0.038 [-1.078, 1.000], loss: 11.958326, mean_absolute_error: 51.037739, mean_q: 68.285110
 567800/700000: episode: 1656, duration: 1.619s, episode steps: 313, steps per second: 193, episode rew

 575295/700000: episode: 1681, duration: 0.554s, episode steps: 103, steps per second: 186, episode reward: -42.333, mean reward: -0.411 [-100.000, 10.996], mean action: 1.932 [0.000, 3.000], mean observation: 0.120 [-0.971, 1.000], loss: 9.269746, mean_absolute_error: 52.033535, mean_q: 69.738403
 576295/700000: episode: 1682, duration: 5.775s, episode steps: 1000, steps per second: 173, episode reward: -14.959, mean reward: -0.015 [-18.833, 20.431], mean action: 1.036 [0.000, 3.000], mean observation: 0.174 [-1.010, 1.000], loss: 8.913031, mean_absolute_error: 51.478661, mean_q: 68.998779
 576840/700000: episode: 1683, duration: 2.891s, episode steps: 545, steps per second: 189, episode reward: 214.004, mean reward: 0.393 [-19.656, 100.000], mean action: 0.954 [0.000, 3.000], mean observation: 0.165 [-1.041, 1.000], loss: 10.511380, mean_absolute_error: 51.808327, mean_q: 69.459763
 576958/700000: episode: 1684, duration: 0.593s, episode steps: 118, steps per second: 199, episode rew

 584931/700000: episode: 1709, duration: 0.947s, episode steps: 187, steps per second: 198, episode reward: -24.210, mean reward: -0.129 [-100.000, 14.290], mean action: 1.642 [0.000, 3.000], mean observation: -0.079 [-0.795, 1.403], loss: 8.108272, mean_absolute_error: 52.415012, mean_q: 69.878616
 585272/700000: episode: 1710, duration: 1.757s, episode steps: 341, steps per second: 194, episode reward: 172.960, mean reward: 0.507 [-20.264, 100.000], mean action: 0.845 [0.000, 3.000], mean observation: 0.149 [-1.012, 1.000], loss: 9.483685, mean_absolute_error: 52.402058, mean_q: 70.073586
 585531/700000: episode: 1711, duration: 1.318s, episode steps: 259, steps per second: 196, episode reward: 248.169, mean reward: 0.958 [-9.950, 100.000], mean action: 1.189 [0.000, 3.000], mean observation: 0.015 [-0.740, 1.000], loss: 14.938274, mean_absolute_error: 52.464634, mean_q: 70.003868
 585847/700000: episode: 1712, duration: 1.619s, episode steps: 316, steps per second: 195, episode rewa

 593513/700000: episode: 1737, duration: 1.256s, episode steps: 223, steps per second: 178, episode reward: 224.345, mean reward: 1.006 [-9.623, 100.000], mean action: 1.507 [0.000, 3.000], mean observation: 0.042 [-0.946, 1.000], loss: 10.911820, mean_absolute_error: 51.824001, mean_q: 69.335587
 593841/700000: episode: 1738, duration: 1.710s, episode steps: 328, steps per second: 192, episode reward: 218.788, mean reward: 0.667 [-3.264, 100.000], mean action: 1.482 [0.000, 3.000], mean observation: 0.082 [-0.744, 1.000], loss: 5.607190, mean_absolute_error: 52.035740, mean_q: 69.725548
 594083/700000: episode: 1739, duration: 1.251s, episode steps: 242, steps per second: 193, episode reward: 224.108, mean reward: 0.926 [-8.809, 100.000], mean action: 1.434 [0.000, 3.000], mean observation: 0.090 [-1.002, 1.000], loss: 8.003345, mean_absolute_error: 52.377926, mean_q: 70.167633
 594284/700000: episode: 1740, duration: 1.000s, episode steps: 201, steps per second: 201, episode reward: 

 601369/700000: episode: 1765, duration: 1.942s, episode steps: 378, steps per second: 195, episode reward: 174.316, mean reward: 0.461 [-13.605, 100.000], mean action: 1.593 [0.000, 3.000], mean observation: 0.153 [-0.873, 1.000], loss: 6.613391, mean_absolute_error: 52.103329, mean_q: 69.581017
 601633/700000: episode: 1766, duration: 1.344s, episode steps: 264, steps per second: 196, episode reward: 247.007, mean reward: 0.936 [-10.957, 100.000], mean action: 1.144 [0.000, 3.000], mean observation: 0.127 [-0.752, 1.000], loss: 6.861195, mean_absolute_error: 51.592369, mean_q: 68.825745
 601907/700000: episode: 1767, duration: 1.373s, episode steps: 274, steps per second: 200, episode reward: 217.822, mean reward: 0.795 [-3.146, 100.000], mean action: 0.920 [0.000, 3.000], mean observation: 0.126 [-0.937, 1.000], loss: 13.971601, mean_absolute_error: 51.371922, mean_q: 68.737839
 602292/700000: episode: 1768, duration: 2.024s, episode steps: 385, steps per second: 190, episode reward

 609663/700000: episode: 1793, duration: 2.049s, episode steps: 400, steps per second: 195, episode reward: 183.211, mean reward: 0.458 [-10.865, 100.000], mean action: 1.093 [0.000, 3.000], mean observation: 0.180 [-0.971, 1.000], loss: 6.431015, mean_absolute_error: 53.035713, mean_q: 71.010773
 609891/700000: episode: 1794, duration: 1.167s, episode steps: 228, steps per second: 195, episode reward: 248.350, mean reward: 1.089 [-12.392, 100.000], mean action: 1.263 [0.000, 3.000], mean observation: 0.118 [-0.727, 1.185], loss: 5.310159, mean_absolute_error: 52.703323, mean_q: 70.680923
 610102/700000: episode: 1795, duration: 1.065s, episode steps: 211, steps per second: 198, episode reward: 234.032, mean reward: 1.109 [-9.523, 100.000], mean action: 1.370 [0.000, 3.000], mean observation: 0.084 [-0.754, 1.000], loss: 7.689984, mean_absolute_error: 52.675346, mean_q: 70.471359
 610297/700000: episode: 1796, duration: 0.984s, episode steps: 195, steps per second: 198, episode reward:

 618030/700000: episode: 1821, duration: 0.635s, episode steps: 126, steps per second: 198, episode reward: 10.099, mean reward: 0.080 [-100.000, 11.038], mean action: 1.897 [0.000, 3.000], mean observation: 0.055 [-0.788, 1.353], loss: 4.661095, mean_absolute_error: 53.050598, mean_q: 70.900673
 618292/700000: episode: 1822, duration: 1.346s, episode steps: 262, steps per second: 195, episode reward: 192.312, mean reward: 0.734 [-3.399, 100.000], mean action: 1.069 [0.000, 3.000], mean observation: 0.163 [-1.177, 1.000], loss: 11.065434, mean_absolute_error: 53.196468, mean_q: 70.923340
 618447/700000: episode: 1823, duration: 0.774s, episode steps: 155, steps per second: 200, episode reward: 218.051, mean reward: 1.407 [-10.966, 100.000], mean action: 1.271 [0.000, 3.000], mean observation: 0.079 [-1.071, 1.000], loss: 10.546539, mean_absolute_error: 53.023159, mean_q: 70.926437
 618858/700000: episode: 1824, duration: 2.138s, episode steps: 411, steps per second: 192, episode reward

 628003/700000: episode: 1849, duration: 1.978s, episode steps: 385, steps per second: 195, episode reward: 211.781, mean reward: 0.550 [-19.702, 100.000], mean action: 1.636 [0.000, 3.000], mean observation: 0.095 [-0.723, 1.123], loss: 8.246916, mean_absolute_error: 52.691593, mean_q: 70.586174
 628110/700000: episode: 1850, duration: 0.539s, episode steps: 107, steps per second: 199, episode reward: -26.804, mean reward: -0.251 [-100.000, 14.988], mean action: 1.514 [0.000, 3.000], mean observation: -0.011 [-1.619, 1.000], loss: 19.540464, mean_absolute_error: 52.820564, mean_q: 70.702263
 628267/700000: episode: 1851, duration: 0.796s, episode steps: 157, steps per second: 197, episode reward: 13.430, mean reward: 0.086 [-100.000, 32.689], mean action: 1.854 [0.000, 3.000], mean observation: 0.053 [-0.862, 1.229], loss: 6.778589, mean_absolute_error: 52.801849, mean_q: 70.853371
 628539/700000: episode: 1852, duration: 1.373s, episode steps: 272, steps per second: 198, episode rewa

 637432/700000: episode: 1877, duration: 0.892s, episode steps: 178, steps per second: 199, episode reward: 224.235, mean reward: 1.260 [-11.779, 100.000], mean action: 1.067 [0.000, 3.000], mean observation: 0.075 [-1.026, 1.209], loss: 7.325198, mean_absolute_error: 52.279057, mean_q: 70.097847
 637646/700000: episode: 1878, duration: 1.089s, episode steps: 214, steps per second: 196, episode reward: 204.412, mean reward: 0.955 [-9.805, 100.000], mean action: 1.579 [0.000, 3.000], mean observation: 0.020 [-0.784, 1.000], loss: 11.639611, mean_absolute_error: 52.392578, mean_q: 70.042496
 637870/700000: episode: 1879, duration: 1.149s, episode steps: 224, steps per second: 195, episode reward: 234.044, mean reward: 1.045 [-17.381, 100.000], mean action: 1.156 [0.000, 3.000], mean observation: 0.111 [-0.990, 1.098], loss: 11.046592, mean_absolute_error: 52.117664, mean_q: 69.799431
 638040/700000: episode: 1880, duration: 0.852s, episode steps: 170, steps per second: 199, episode rewar

 644666/700000: episode: 1905, duration: 0.981s, episode steps: 193, steps per second: 197, episode reward: 184.159, mean reward: 0.954 [-9.542, 100.000], mean action: 1.280 [0.000, 3.000], mean observation: 0.053 [-0.867, 1.000], loss: 11.254116, mean_absolute_error: 52.677807, mean_q: 70.625618
 644913/700000: episode: 1906, duration: 1.251s, episode steps: 247, steps per second: 197, episode reward: 212.966, mean reward: 0.862 [-17.333, 100.000], mean action: 1.283 [0.000, 3.000], mean observation: 0.111 [-1.065, 1.000], loss: 7.600597, mean_absolute_error: 52.630241, mean_q: 70.609993
 645314/700000: episode: 1907, duration: 2.098s, episode steps: 401, steps per second: 191, episode reward: 211.065, mean reward: 0.526 [-19.467, 100.000], mean action: 0.820 [0.000, 3.000], mean observation: 0.169 [-0.906, 1.000], loss: 13.655714, mean_absolute_error: 52.425171, mean_q: 70.186035
 645510/700000: episode: 1908, duration: 0.983s, episode steps: 196, steps per second: 199, episode rewar

 652386/700000: episode: 1933, duration: 1.102s, episode steps: 219, steps per second: 199, episode reward: 207.249, mean reward: 0.946 [-18.454, 100.000], mean action: 1.205 [0.000, 3.000], mean observation: 0.107 [-0.815, 1.000], loss: 8.541140, mean_absolute_error: 53.160049, mean_q: 71.187187
 652627/700000: episode: 1934, duration: 1.414s, episode steps: 241, steps per second: 170, episode reward: 201.033, mean reward: 0.834 [-9.039, 100.000], mean action: 1.178 [0.000, 3.000], mean observation: 0.086 [-0.827, 1.000], loss: 6.795444, mean_absolute_error: 53.693745, mean_q: 72.208473
 652912/700000: episode: 1935, duration: 1.477s, episode steps: 285, steps per second: 193, episode reward: 224.560, mean reward: 0.788 [-17.352, 100.000], mean action: 1.502 [0.000, 3.000], mean observation: 0.102 [-0.671, 1.000], loss: 9.466476, mean_absolute_error: 53.104847, mean_q: 71.344833
 653082/700000: episode: 1936, duration: 0.854s, episode steps: 170, steps per second: 199, episode reward:

 660004/700000: episode: 1961, duration: 1.442s, episode steps: 286, steps per second: 198, episode reward: 247.865, mean reward: 0.867 [-17.553, 100.000], mean action: 1.318 [0.000, 3.000], mean observation: 0.114 [-0.718, 1.012], loss: 8.406827, mean_absolute_error: 54.147575, mean_q: 72.527893
 660203/700000: episode: 1962, duration: 1.002s, episode steps: 199, steps per second: 199, episode reward: 244.520, mean reward: 1.229 [-11.447, 100.000], mean action: 1.482 [0.000, 3.000], mean observation: 0.085 [-0.681, 1.000], loss: 12.867167, mean_absolute_error: 54.154610, mean_q: 72.535881
 660390/700000: episode: 1963, duration: 0.936s, episode steps: 187, steps per second: 200, episode reward: 224.952, mean reward: 1.203 [-2.758, 100.000], mean action: 1.332 [0.000, 3.000], mean observation: 0.076 [-0.687, 1.000], loss: 7.044367, mean_absolute_error: 54.725246, mean_q: 73.115944
 660588/700000: episode: 1964, duration: 1.001s, episode steps: 198, steps per second: 198, episode reward

 667282/700000: episode: 1989, duration: 0.498s, episode steps: 98, steps per second: 197, episode reward: -386.587, mean reward: -3.945 [-100.000, 3.861], mean action: 1.847 [0.000, 3.000], mean observation: 0.265 [-1.250, 2.041], loss: 14.901793, mean_absolute_error: 54.011082, mean_q: 72.691727
 667642/700000: episode: 1990, duration: 1.889s, episode steps: 360, steps per second: 191, episode reward: 127.680, mean reward: 0.355 [-19.198, 100.000], mean action: 1.969 [0.000, 3.000], mean observation: 0.107 [-0.953, 1.000], loss: 8.343477, mean_absolute_error: 54.625210, mean_q: 73.458961
 667924/700000: episode: 1991, duration: 1.425s, episode steps: 282, steps per second: 198, episode reward: 228.470, mean reward: 0.810 [-17.553, 100.000], mean action: 0.908 [0.000, 3.000], mean observation: 0.132 [-0.890, 1.000], loss: 9.638046, mean_absolute_error: 54.674210, mean_q: 73.673439
 668632/700000: episode: 1992, duration: 3.765s, episode steps: 708, steps per second: 188, episode rewar

 676582/700000: episode: 2017, duration: 1.736s, episode steps: 338, steps per second: 195, episode reward: 261.582, mean reward: 0.774 [-19.309, 100.000], mean action: 0.953 [0.000, 3.000], mean observation: 0.197 [-0.806, 1.403], loss: 9.192776, mean_absolute_error: 53.371613, mean_q: 71.859665
 676803/700000: episode: 2018, duration: 1.118s, episode steps: 221, steps per second: 198, episode reward: 245.270, mean reward: 1.110 [-9.631, 100.000], mean action: 1.235 [0.000, 3.000], mean observation: 0.037 [-0.693, 1.000], loss: 4.828215, mean_absolute_error: 52.881248, mean_q: 71.157234
 677287/700000: episode: 2019, duration: 2.578s, episode steps: 484, steps per second: 188, episode reward: 226.408, mean reward: 0.468 [-18.591, 100.000], mean action: 0.632 [0.000, 3.000], mean observation: 0.205 [-0.798, 1.402], loss: 8.735344, mean_absolute_error: 53.278934, mean_q: 71.571373
 677672/700000: episode: 2020, duration: 2.579s, episode steps: 385, steps per second: 149, episode reward:

 684918/700000: episode: 2045, duration: 1.838s, episode steps: 358, steps per second: 195, episode reward: 239.112, mean reward: 0.668 [-19.902, 100.000], mean action: 0.804 [0.000, 3.000], mean observation: 0.112 [-1.141, 1.016], loss: 9.238003, mean_absolute_error: 53.025730, mean_q: 71.176796
 685097/700000: episode: 2046, duration: 0.897s, episode steps: 179, steps per second: 200, episode reward: 196.349, mean reward: 1.097 [-2.799, 100.000], mean action: 0.939 [0.000, 3.000], mean observation: 0.091 [-1.097, 1.000], loss: 4.562304, mean_absolute_error: 52.828571, mean_q: 71.061874
 685350/700000: episode: 2047, duration: 1.294s, episode steps: 253, steps per second: 196, episode reward: 237.142, mean reward: 0.937 [-8.667, 100.000], mean action: 1.237 [0.000, 3.000], mean observation: -0.017 [-1.138, 1.000], loss: 13.281186, mean_absolute_error: 52.938900, mean_q: 71.149811
 685832/700000: episode: 2048, duration: 2.514s, episode steps: 482, steps per second: 192, episode reward

 692445/700000: episode: 2073, duration: 2.661s, episode steps: 518, steps per second: 195, episode reward: 219.884, mean reward: 0.424 [-17.945, 100.000], mean action: 0.620 [0.000, 3.000], mean observation: 0.184 [-0.967, 1.019], loss: 6.426335, mean_absolute_error: 53.451675, mean_q: 71.844070
 692670/700000: episode: 2074, duration: 1.137s, episode steps: 225, steps per second: 198, episode reward: 217.784, mean reward: 0.968 [-17.530, 100.000], mean action: 1.040 [0.000, 3.000], mean observation: 0.128 [-0.964, 1.000], loss: 6.040617, mean_absolute_error: 53.745808, mean_q: 72.366768
 693022/700000: episode: 2075, duration: 1.837s, episode steps: 352, steps per second: 192, episode reward: 254.057, mean reward: 0.722 [-18.543, 100.000], mean action: 0.966 [0.000, 3.000], mean observation: 0.167 [-1.018, 1.000], loss: 7.500653, mean_absolute_error: 54.037308, mean_q: 72.461327
 694022/700000: episode: 2076, duration: 5.582s, episode steps: 1000, steps per second: 179, episode rewar

Now that we have trained our model and saved the weights to file, we can perform our evaluation experiment. We believe a three-part evaluation experiment would suitably evaluate how effective the trained model is:
1. Visual evaluation: While this is only a rough evaluation, a visual inspection of the model can still be useful to get an idea of its performance. We will visualize the model playing Lunar Lander 10 times and make observations on its performance.
2. Reward evaluation: Reward is a good measure of the model's performance. As specified on OpenAI's GitHub page (https://github.com/openai/gym/wiki/Leaderboard#lunarlander-v2), reward for Lunar Lander is gained by moving towards the landing area and landing successfully, and reward is lost by moving away from the landing area and crashing. We will run our model over 200 episodes and note its mean reward. This should be a good indication of whether our model is performing well or not.
3. Solved Episodes evaluation: OpenAI defines the game as "solved" when the reward is 200. We will count how many solved episodes occur during the 200 episodes.


First, we rebuild the model and load the weights saved previously. This step is not needed if the previous cell has just been run; however, it means that the previous cell only needs to be run once, after which the following cell can be run without needing to recalculate the weights.

In [19]:
# Rebuild the environment, network and agent with the same settings as the training cell,
# then load the previously saved weights so that evaluation can run without retraining.

# Defined here as well (same value as in the training cell) so that this cell does not
# depend on the training cell having been run in the current session.
ENV_NAME = 'LunarLander-v2'

# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Same architecture as in the training cell: three hidden layers of 128 ReLU units and a
# linear output with one value per action.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so it is not wrapped in print()
# (doing so would emit a stray "None" after the table).
model.summary()

# Configure and compile the agent with the same settings as used for training.
memory = SequentialMemory(limit=500000, window_length=1)
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Load the weights that were saved at the end of training.
dqn.load_weights('dqn_{}_weights.h5f'.format(ENV_NAME))

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_6 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_21 (Dense)             (None, 128)               1152      
_________________________________________________________________
activation_21 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 128)               16512     
_________________________________________________________________
activation_22 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 128)               16512     
__________________________________

### Evaluation 1 - Visual

We simply run the model on 10 episodes and observe the results

In [16]:
# Run 10 test episodes with rendering switched on so the agent's play can be inspected by eye.
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: 226.454, steps: 204
Episode 2: reward: 200.664, steps: 200
Episode 3: reward: 229.733, steps: 307
Episode 4: reward: 233.379, steps: 253
Episode 5: reward: 179.293, steps: 427
Episode 6: reward: 215.350, steps: 229
Episode 7: reward: 243.887, steps: 222
Episode 8: reward: 203.602, steps: 212
Episode 9: reward: 228.328, steps: 191
Episode 10: reward: 192.017, steps: 428


<keras.callbacks.History at 0x12bd52eb8>

Observations:

* The model performed very well, as reflected in its excellent scores
* The spacecraft never once crashed
* Having played the game ourselves, we believe that the model performed better than we ever could, and had inhuman precision.

### Evaluation 2 - Reward

We run the model on 200 episodes and store the resulting rewards in a pandas dataframe

In [20]:
# Run 200 test episodes without rendering or per-episode logging.
history = dqn.test(env, nb_episodes=200, visualize=False, verbose=0)

# Collect the per-episode rewards recorded by the test run into a one-column dataframe.
episode_rewards = history.history['episode_reward']
results_df = pd.DataFrame({'Reward': episode_rewards})

# Show the first 10 rows.
results_df.iloc[:10]

Unnamed: 0,Reward
0,226.453966
1,200.664476
2,229.732994
3,233.379147
4,179.292717
5,215.349685
6,243.886568
7,203.602363
8,228.328149
9,192.017400


In [21]:
# Mean episode reward over all evaluation episodes stored in results_df.
results_df['Reward'].mean()

214.51801258860186

As we can see from the dataframe containing the episode rewards, as well as the mean of all rewards, the model performs very well on almost all episodes. There are, however, a few episodes in which it performs worse than expected (i.e. below 100).

### Evaluation 3 - Solved Episodes

* All episodes with a reward of over 200 count as solved episodes

In [13]:
# Number of evaluation episodes whose reward is strictly greater than 200.
results_df[results_df['Reward'].gt(200)].count()

Reward    172
dtype: int64

In [14]:
# Number of evaluation episodes whose reward is strictly below 0.
results_df[results_df['Reward'].lt(0)].count()

Reward    0
dtype: int64

We can see above that 172 out of the 200 episodes were "solved" (86% of all episodes), while not a single episode got a reward below 0. OpenAI defines the game of LunarLander as "solved" when an average reward of 200 or more is achieved over 100 episodes. Since our mean reward over the 200 evaluation episodes was about 214.5, we can therefore say that we "solved" LunarLander.

# Conclusions

1. From visually observing the trained model playing Lunar Lander, we believe that it appears to be better than a good human at playing the game. With more training, we believe it would appear to be better than any human expert at the game.
2. The mean reward over the 200 episodes was very high (over 200)
3. Our model "solved" the game 86% of the time. We believe that more training episodes would allow it to solve the game even more often.