In [1]:
import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
import tensorflow as tf

import rlcard
from rlcard.agents.dqn_agent import DQNAgent
from rlcard.utils.utils import set_global_seed
from rlcard.utils.logger import Logger

# ---------------------------------------------------------------------------
# Train a DQN agent on the RLCard 'blackjack' environment.  Every
# `evaluate_every` episodes the agent is run `evaluate_num` times on a second
# environment and the average payoff is logged; every `save_plot_every`
# episodes the logger is asked to save a plot of the points collected so far.
# ---------------------------------------------------------------------------

# One environment instance is used for training, a second one for evaluation.
env = rlcard.make('blackjack')
eval_env = rlcard.make('blackjack')

# Schedule: total number of training episodes, how often (in episodes) to
# evaluate and to save a plot, and how many runs each evaluation performs.
episode_num = 1000000
evaluate_every = 100
save_plot_every = 1000
evaluate_num = 10000

# Passed to the agent as `replay_memory_init_size` and `norm_step`; their sum
# is also the number of fed transitions after which training calls begin.
memory_init_size = 100
norm_step = 100

# Output locations for the text log, the CSV file and the saved figures.
root_path = './experiments/blackjack_dqn_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Fix the global seed.
set_global_seed(0)

with tf.Session() as sess:

    # This variable is not referenced again anywhere in this script.
    # NOTE(review): presumably the agent's training op looks it up by its
    # graph name ('global_step') -- confirm before removing it.
    global_step = tf.Variable(0, name='global_step', trainable=False)

    agent = DQNAgent(
        sess,
        scope='dqn',
        action_num=env.action_num,
        replay_memory_init_size=memory_init_size,
        norm_step=norm_step,
        state_shape=env.state_shape,
        mlp_layers=[10, 10],
    )

    # The same agent object is registered with both environments.
    for environment in (env, eval_env):
        environment.set_agents([agent])

    sess.run(tf.global_variables_initializer())

    # Logger that records evaluation points and writes the log/CSV files.
    logger = Logger(
        xlabel='timestep',
        ylabel='reward',
        legend='DQN on Blackjack',
        log_path=log_path,
        csv_path=csv_path,
    )

    # Number of transitions fed to the agent so far, and the count that must
    # be exceeded before `agent.train()` is called.
    step_counter = 0
    train_after = memory_init_size + norm_step

    for episode in range(episode_num):

        # Run one training episode and collect its trajectories.
        trajectories, _ = env.run(is_training=True)

        # Feed every transition of the first trajectory into the agent.
        for transition in trajectories[0]:
            agent.feed(transition)
            step_counter += 1

            # Skip training until enough transitions have been fed.
            if step_counter <= train_after:
                continue

            loss = agent.train()
            # '\r' with end='' keeps rewriting the same console line.
            print('\rINFO - Step {}, loss: {}'.format(step_counter, loss), end='')

        # Periodic evaluation: accumulate the first entry of the payoffs
        # returned by each evaluation run, then average over the runs.
        if episode % evaluate_every == 0:
            reward = sum(
                eval_env.run(is_training=False)[1][0]
                for _ in range(evaluate_num)
            )
            average_reward = float(reward) / evaluate_num
            timestep = env.timestep

            logger.log('\n########## Evaluation ##########')
            logger.log('Timestep: {} Average reward is {}'.format(timestep, average_reward))

            # Record the point, keyed by the training environment's timestep.
            logger.add_point(x=timestep, y=average_reward)

        # Periodic plot, skipped for the very first episode.
        if episode > 0 and episode % save_plot_every == 0:
            logger.make_plot(save_path=figure_path + str(episode) + '.png')

    # Final plot, named after the last episode index.
    logger.make_plot(save_path=figure_path + 'final_' + str(episode) + '.png')


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.




Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.


Instructions for updating:
Please switch to tf.train.get_global_step

########## Evaluation ##########
Timestep: 1 Average reward is -0.2654

########## Evaluation ##########
Timestep: 136 Average reward is -0.6396


INFO - Copied model parameters to target network.
INFO - Step 271, loss: 0.6868118643760681
########## Evaluation ##########
Timestep: 271 Average reward is -0.6076
INFO - Step 402, loss: 0.8737920522689819
########## Evaluation ##########
Timestep: 402 Average reward is -0.5777
I

INFO - Step 6122, loss: 0.47987440228462224
########## Evaluation ##########
Timestep: 6122 Average reward is -0.0696
INFO - Step 6200, loss: 0.54132503271102963
INFO - Copied model parameters to target network.
INFO - Step 6261, loss: 0.47613316774368286
########## Evaluation ##########
Timestep: 6261 Average reward is -0.0741
INFO - Step 6393, loss: 0.46913129091262823
########## Evaluation ##########
Timestep: 6393 Average reward is -0.0855
INFO - Step 6526, loss: 0.57319259643554693
########## Evaluation ##########
Timestep: 6526 Average reward is -0.0616
INFO - Step 6669, loss: 0.53154265880584723
########## Evaluation ##########
Timestep: 6669 Average reward is -0.0828
INFO - Step 6799, loss: 0.36658540368080145
########## Evaluation ##########
Timestep: 6799 Average reward is -0.0707
INFO - Step 6921, loss: 0.62187141180038456
########## Evaluation ##########
Timestep: 6921 Average reward is -0.0701
INFO - Step 7060, loss: 0.55895626544952395
########## Evaluation ##########
Tim

INFO - Step 14758, loss: 0.51538145542144786
########## Evaluation ##########
Timestep: 14758 Average reward is -0.0645
INFO - Step 14892, loss: 0.48918437957763675
########## Evaluation ##########
Timestep: 14892 Average reward is -0.072
INFO - Step 15038, loss: 0.49480485916137695
########## Evaluation ##########
Timestep: 15038 Average reward is -0.0728
INFO - Step 15179, loss: 0.40406340360641483
########## Evaluation ##########
Timestep: 15179 Average reward is -0.0736
INFO - Step 15200, loss: 0.58654731512069716
INFO - Copied model parameters to target network.
INFO - Step 15329, loss: 0.60085463523864753
########## Evaluation ##########
Timestep: 15329 Average reward is -0.0633
INFO - Step 15476, loss: 0.50157791376113896
########## Evaluation ##########
Timestep: 15476 Average reward is -0.0667
INFO - Step 15622, loss: 0.73205351829528817
########## Evaluation ##########
Timestep: 15622 Average reward is -0.0807
INFO - Step 15774, loss: 0.48099109530448914
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 29737, loss: 0.39924341440200806
########## Evaluation ##########
Timestep: 29737 Average reward is -0.0901
INFO - Step 29887, loss: 0.61211824417114265
########## Evaluation ##########
Timestep: 29887 Average reward is -0.0601
INFO - Step 30027, loss: 0.54617989063262946
########## Evaluation ##########
Timestep: 30027 Average reward is -0.0738
INFO - Step 30162, loss: 0.50404405593872073
########## Evaluation ##########
Timestep: 30162 Average reward is -0.081
INFO - Step 30200, loss: 0.45176714658737187
INFO - Copied model parameters to target network.
INFO - Step 30305, loss: 0.61537456512451173
########## Evaluation ##########
Timestep: 30305 Average reward is -0.0839
INFO - Step 30457, loss: 0.40182745456695557
########## Evaluation ##########
Timestep: 30457 Average reward is -0.0554
INFO - Step 30610, loss: 0.45347458124160767
########## Evaluation ##########
Timestep: 30610 Average reward is -0.0766
INFO - Step 30753, loss: 0.32254457473754883
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 31200, loss: 0.53237211704254153
INFO - Copied model parameters to target network.
INFO - Step 31214, loss: 0.37848860025405884
########## Evaluation ##########
Timestep: 31214 Average reward is -0.0648
INFO - Step 31363, loss: 0.62139081954956056
########## Evaluation ##########
Timestep: 31363 Average reward is -0.0643
INFO - Step 31512, loss: 0.64104110002517754
########## Evaluation ##########
Timestep: 31512 Average reward is -0.0626
INFO - Step 31657, loss: 0.51619052886962897
########## Evaluation ##########
Timestep: 31657 Average reward is -0.0667
INFO - Step 31800, loss: 0.54419606924057014
########## Evaluation ##########
Timestep: 31800 Average reward is -0.0851
INFO - Step 31940, loss: 0.85574197769165046
########## Evaluation ##########
Timestep: 31940 Average reward is -0.0724
INFO - Step 32084, loss: 0.58445584774017335
########## Evaluation ##########
Timestep: 32084 Average reward is -0.071
INFO - Step 32200, loss: 0.52224397659301764
INFO - Copied model p

  fig, ax = plt.subplots()


INFO - Step 32692, loss: 0.45442640781402594
########## Evaluation ##########
Timestep: 32692 Average reward is -0.0717
INFO - Step 32831, loss: 0.83650630712509163
########## Evaluation ##########
Timestep: 32831 Average reward is -0.0884
INFO - Step 32974, loss: 0.42782503366470337
########## Evaluation ##########
Timestep: 32974 Average reward is -0.055
INFO - Step 33122, loss: 0.57120454311370856
########## Evaluation ##########
Timestep: 33122 Average reward is -0.0651
INFO - Step 33200, loss: 0.59919673204422854
INFO - Copied model parameters to target network.
INFO - Step 33278, loss: 0.57561528682708745
########## Evaluation ##########
Timestep: 33278 Average reward is -0.0665
INFO - Step 33432, loss: 0.53216552734375336
########## Evaluation ##########
Timestep: 33432 Average reward is -0.0531
INFO - Step 33588, loss: 0.67718791961669926
########## Evaluation ##########
Timestep: 33588 Average reward is -0.0769
INFO - Step 33736, loss: 0.47817635536193855
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 34165, loss: 0.67277073860168463
########## Evaluation ##########
Timestep: 34165 Average reward is -0.0718
INFO - Step 34200, loss: 0.57087355852127086
INFO - Copied model parameters to target network.
INFO - Step 34315, loss: 0.49557781219482427
########## Evaluation ##########
Timestep: 34315 Average reward is -0.068
INFO - Step 34467, loss: 0.32243880629539496
########## Evaluation ##########
Timestep: 34467 Average reward is -0.0489
INFO - Step 34615, loss: 0.45913153886795044
########## Evaluation ##########
Timestep: 34615 Average reward is -0.0633
INFO - Step 34766, loss: 0.46434372663497925
########## Evaluation ##########
Timestep: 34766 Average reward is -0.0632
INFO - Step 34918, loss: 0.41724789142608647
########## Evaluation ##########
Timestep: 34918 Average reward is -0.0556
INFO - Step 35066, loss: 0.46598148345947266
########## Evaluation ##########
Timestep: 35066 Average reward is -0.0683
INFO - Step 35200, loss: 0.51527041196823125
INFO - Copied model p

  fig, ax = plt.subplots()


INFO - Step 35694, loss: 0.51074838638305666
########## Evaluation ##########
Timestep: 35694 Average reward is -0.0862
INFO - Step 35849, loss: 0.52979099750518874
########## Evaluation ##########
Timestep: 35849 Average reward is -0.0554
INFO - Step 35993, loss: 0.70528107881546024
########## Evaluation ##########
Timestep: 35993 Average reward is -0.0861
INFO - Step 36139, loss: 0.38691934943199166
########## Evaluation ##########
Timestep: 36139 Average reward is -0.0794
INFO - Step 36200, loss: 0.41941392421722415
INFO - Copied model parameters to target network.
INFO - Step 36281, loss: 0.36578825116157534
########## Evaluation ##########
Timestep: 36281 Average reward is -0.0743
INFO - Step 36435, loss: 0.40710750222206116
########## Evaluation ##########
Timestep: 36435 Average reward is -0.0752
INFO - Step 36588, loss: 0.52710109949111945
########## Evaluation ##########
Timestep: 36588 Average reward is -0.0679
INFO - Step 36749, loss: 0.46179252862930324
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 37200, loss: 0.51358270645141653
INFO - Copied model parameters to target network.
INFO - Step 37202, loss: 0.49736011028289795
########## Evaluation ##########
Timestep: 37202 Average reward is -0.0572
INFO - Step 37356, loss: 0.61005085706710824
########## Evaluation ##########
Timestep: 37356 Average reward is -0.065
INFO - Step 37518, loss: 0.47030049562454224
########## Evaluation ##########
Timestep: 37518 Average reward is -0.0735
INFO - Step 37680, loss: 0.43668019771575933
########## Evaluation ##########
Timestep: 37680 Average reward is -0.0739
INFO - Step 37822, loss: 0.57733607292175293
########## Evaluation ##########
Timestep: 37822 Average reward is -0.0881
INFO - Step 37971, loss: 0.60991132259368903
########## Evaluation ##########
Timestep: 37971 Average reward is -0.0733
INFO - Step 38113, loss: 0.56148946285247825
########## Evaluation ##########
Timestep: 38113 Average reward is -0.0796
INFO - Step 38200, loss: 0.44805687665939337
INFO - Copied model p

  fig, ax = plt.subplots()


INFO - Step 38701, loss: 0.61737501621246345
########## Evaluation ##########
Timestep: 38701 Average reward is -0.0683
INFO - Step 38845, loss: 0.52108782529830934
########## Evaluation ##########
Timestep: 38845 Average reward is -0.0761
INFO - Step 38997, loss: 0.50418734550476075
########## Evaluation ##########
Timestep: 38997 Average reward is -0.0747
INFO - Step 39159, loss: 0.35631418228149414
########## Evaluation ##########
Timestep: 39159 Average reward is -0.073
INFO - Step 39200, loss: 0.33775323629379274
INFO - Copied model parameters to target network.
INFO - Step 39302, loss: 0.50175571441650394
########## Evaluation ##########
Timestep: 39302 Average reward is -0.0721
INFO - Step 39457, loss: 0.67837738990783696
########## Evaluation ##########
Timestep: 39457 Average reward is -0.0768
INFO - Step 39613, loss: 0.56199514865875246
########## Evaluation ##########
Timestep: 39613 Average reward is -0.0781
INFO - Step 39770, loss: 0.31935530900955276
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 40200, loss: 0.68344533443450933
INFO - Copied model parameters to target network.
INFO - Step 40245, loss: 0.59689295291900635
########## Evaluation ##########
Timestep: 40245 Average reward is -0.0675
INFO - Step 40406, loss: 0.65015482902526864
########## Evaluation ##########
Timestep: 40406 Average reward is -0.0777
INFO - Step 40559, loss: 0.57537007331848143
########## Evaluation ##########
Timestep: 40559 Average reward is -0.0686
INFO - Step 40721, loss: 0.61839246749877937
########## Evaluation ##########
Timestep: 40721 Average reward is -0.0745
INFO - Step 40882, loss: 0.57593649625778215
########## Evaluation ##########
Timestep: 40882 Average reward is -0.0824
INFO - Step 41045, loss: 0.54354250431060793
########## Evaluation ##########
Timestep: 41045 Average reward is -0.0655
INFO - Step 41197, loss: 0.57647901773452767
########## Evaluation ##########
Timestep: 41197 Average reward is -0.0687
INFO - Step 41200, loss: 0.75384527444839487
INFO - Copied model 

  fig, ax = plt.subplots()


INFO - Step 41810, loss: 0.54612791538238534
########## Evaluation ##########
Timestep: 41810 Average reward is -0.0575
INFO - Step 41968, loss: 0.79375988245010385
########## Evaluation ##########
Timestep: 41968 Average reward is -0.0729
INFO - Step 42120, loss: 0.45555317401885986
########## Evaluation ##########
Timestep: 42120 Average reward is -0.0637
INFO - Step 42200, loss: 0.51154047250747687
INFO - Copied model parameters to target network.
INFO - Step 42266, loss: 0.41631448268890387
########## Evaluation ##########
Timestep: 42266 Average reward is -0.0844
INFO - Step 42419, loss: 0.43850654363632215
########## Evaluation ##########
Timestep: 42419 Average reward is -0.0694
INFO - Step 42566, loss: 0.60010790824890143
########## Evaluation ##########
Timestep: 42566 Average reward is -0.0813
INFO - Step 42708, loss: 0.40317225456237793
########## Evaluation ##########
Timestep: 42708 Average reward is -0.072
INFO - Step 42872, loss: 0.46815827488899236
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 43320, loss: 0.65227222442626957
########## Evaluation ##########
Timestep: 43320 Average reward is -0.0656
INFO - Step 43479, loss: 0.48463618755340576
########## Evaluation ##########
Timestep: 43479 Average reward is -0.0714
INFO - Step 43630, loss: 0.52927422523498543
########## Evaluation ##########
Timestep: 43630 Average reward is -0.0887
INFO - Step 43770, loss: 0.48171326518058777
########## Evaluation ##########
Timestep: 43770 Average reward is -0.0723
INFO - Step 43918, loss: 0.35033148527145386
########## Evaluation ##########
Timestep: 43918 Average reward is -0.0812
INFO - Step 44063, loss: 0.62593376636505133
########## Evaluation ##########
Timestep: 44063 Average reward is -0.0848
INFO - Step 44200, loss: 0.69362461566925056
INFO - Copied model parameters to target network.
INFO - Step 44216, loss: 0.34486937522888184
########## Evaluation ##########
Timestep: 44216 Average reward is -0.0888
INFO - Step 44363, loss: 0.47208201885223394
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 44797, loss: 0.42218354344367983
########## Evaluation ##########
Timestep: 44797 Average reward is -0.0745
INFO - Step 44947, loss: 0.61738461256027224
########## Evaluation ##########
Timestep: 44947 Average reward is -0.0651
INFO - Step 45112, loss: 0.51112604141235354
########## Evaluation ##########
Timestep: 45112 Average reward is -0.0713
INFO - Step 45200, loss: 0.53731113672256473
INFO - Copied model parameters to target network.
INFO - Step 45265, loss: 0.62564212083816537
########## Evaluation ##########
Timestep: 45265 Average reward is -0.0678
INFO - Step 45410, loss: 0.46670442819595337
########## Evaluation ##########
Timestep: 45410 Average reward is -0.0867
INFO - Step 45558, loss: 0.55996501445770263
########## Evaluation ##########
Timestep: 45558 Average reward is -0.0848
INFO - Step 45713, loss: 0.51534605026245126
########## Evaluation ##########
Timestep: 45713 Average reward is -0.0803
INFO - Step 45882, loss: 0.62662965059280485
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 46318, loss: 0.71933114528656016
########## Evaluation ##########
Timestep: 46318 Average reward is -0.0619
INFO - Step 46458, loss: 0.56147873401641856
########## Evaluation ##########
Timestep: 46458 Average reward is -0.0754
INFO - Step 46616, loss: 0.48810493946075446
########## Evaluation ##########
Timestep: 46616 Average reward is -0.09
INFO - Step 46781, loss: 0.42096412181854255
########## Evaluation ##########
Timestep: 46781 Average reward is -0.0765
INFO - Step 46929, loss: 0.31620338559150696
########## Evaluation ##########
Timestep: 46929 Average reward is -0.0628
INFO - Step 47078, loss: 0.54018604755401615
########## Evaluation ##########
Timestep: 47078 Average reward is -0.0853
INFO - Step 47200, loss: 0.53655982017517095
INFO - Copied model parameters to target network.
INFO - Step 47233, loss: 0.52449154853820827
########## Evaluation ##########
Timestep: 47233 Average reward is -0.0746
INFO - Step 47381, loss: 0.57123839855194096
########## Evaluation 

  fig, ax = plt.subplots()



########## Evaluation ##########
Timestep: 47671 Average reward is -0.0556
INFO - Step 47822, loss: 0.70582014322280885
########## Evaluation ##########
Timestep: 47822 Average reward is -0.0678
INFO - Step 47963, loss: 0.48740687966346743
########## Evaluation ##########
Timestep: 47963 Average reward is -0.0762
INFO - Step 48119, loss: 0.60123550891876225
########## Evaluation ##########
Timestep: 48119 Average reward is -0.0731
INFO - Step 48200, loss: 0.52684819698333743
INFO - Copied model parameters to target network.
INFO - Step 48273, loss: 0.44803404808044434
########## Evaluation ##########
Timestep: 48273 Average reward is -0.0661
INFO - Step 48427, loss: 0.68514865636825565
########## Evaluation ##########
Timestep: 48427 Average reward is -0.082
INFO - Step 48576, loss: 0.49073106050491333
########## Evaluation ##########
Timestep: 48576 Average reward is -0.0686
INFO - Step 48735, loss: 0.58656877279281627
########## Evaluation ##########
Timestep: 48735 Average reward i

  fig, ax = plt.subplots()


INFO - Step 49346, loss: 0.47545579075813293
########## Evaluation ##########
Timestep: 49346 Average reward is -0.0547
INFO - Step 49506, loss: 0.52098608016967774
########## Evaluation ##########
Timestep: 49506 Average reward is -0.082
INFO - Step 49680, loss: 0.41028088331222534
########## Evaluation ##########
Timestep: 49680 Average reward is -0.0871
INFO - Step 49828, loss: 0.45583245158195496
########## Evaluation ##########
Timestep: 49828 Average reward is -0.0655
INFO - Step 49983, loss: 0.45487830042839056
########## Evaluation ##########
Timestep: 49983 Average reward is -0.0767
INFO - Step 50125, loss: 0.55446201562881473
########## Evaluation ##########
Timestep: 50125 Average reward is -0.076
INFO - Step 50200, loss: 0.45906475186347967
INFO - Copied model parameters to target network.
INFO - Step 50284, loss: 0.50547218322753914
########## Evaluation ##########
Timestep: 50284 Average reward is -0.0664
INFO - Step 50441, loss: 0.41249433159828186
########## Evaluation 

  fig, ax = plt.subplots()


INFO - Step 50933, loss: 0.38909596204757696
########## Evaluation ##########
Timestep: 50933 Average reward is -0.0644
INFO - Step 51090, loss: 0.48691427707672125
########## Evaluation ##########
Timestep: 51090 Average reward is -0.0513
INFO - Step 51200, loss: 0.45956331491470337
INFO - Copied model parameters to target network.
INFO - Step 51252, loss: 0.47554129362106323
########## Evaluation ##########
Timestep: 51252 Average reward is -0.0703
INFO - Step 51407, loss: 0.44139015674591064
########## Evaluation ##########
Timestep: 51407 Average reward is -0.0912
INFO - Step 51564, loss: 0.63308340311050425
########## Evaluation ##########
Timestep: 51564 Average reward is -0.0873
INFO - Step 51716, loss: 0.59666162729263313
########## Evaluation ##########
Timestep: 51716 Average reward is -0.0687
INFO - Step 51874, loss: 0.37962824106216433
########## Evaluation ##########
Timestep: 51874 Average reward is -0.0963
INFO - Step 52029, loss: 0.56662356853485116
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 52483, loss: 0.42803514003753664
########## Evaluation ##########
Timestep: 52483 Average reward is -0.064
INFO - Step 52642, loss: 0.46423882246017456
########## Evaluation ##########
Timestep: 52642 Average reward is -0.0755
INFO - Step 52788, loss: 0.57255589962005626
########## Evaluation ##########
Timestep: 52788 Average reward is -0.0803
INFO - Step 52947, loss: 0.59641176462173464
########## Evaluation ##########
Timestep: 52947 Average reward is -0.0785
INFO - Step 53088, loss: 0.41326314210891724
########## Evaluation ##########
Timestep: 53088 Average reward is -0.0822
INFO - Step 53200, loss: 0.49909603595733643
INFO - Copied model parameters to target network.
INFO - Step 53243, loss: 0.64551883935928347
########## Evaluation ##########
Timestep: 53243 Average reward is -0.0796
INFO - Step 53384, loss: 0.44991201162338257
########## Evaluation ##########
Timestep: 53384 Average reward is -0.0738
INFO - Step 53545, loss: 0.47619229555130005
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 54017, loss: 0.65050077438354496
########## Evaluation ##########
Timestep: 54017 Average reward is -0.0554
INFO - Step 54167, loss: 0.33909809589385986
########## Evaluation ##########
Timestep: 54167 Average reward is -0.0736
INFO - Step 54200, loss: 0.40866822004318247
INFO - Copied model parameters to target network.
INFO - Step 54316, loss: 0.49826192855834966
########## Evaluation ##########
Timestep: 54316 Average reward is -0.0745
INFO - Step 54490, loss: 0.57759845256805423
########## Evaluation ##########
Timestep: 54490 Average reward is -0.0811
INFO - Step 54639, loss: 0.37913012504577637
########## Evaluation ##########
Timestep: 54639 Average reward is -0.0759
INFO - Step 54802, loss: 0.43950706720352173
########## Evaluation ##########
Timestep: 54802 Average reward is -0.0872
INFO - Step 54947, loss: 0.53630435466766363
########## Evaluation ##########
Timestep: 54947 Average reward is -0.0669
INFO - Step 55099, loss: 0.41539391875267037
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 55550, loss: 0.47420614957809456
########## Evaluation ##########
Timestep: 55550 Average reward is -0.076
INFO - Step 55712, loss: 0.35920304059982377
########## Evaluation ##########
Timestep: 55712 Average reward is -0.0755
INFO - Step 55853, loss: 0.54141664505004885
########## Evaluation ##########
Timestep: 55853 Average reward is -0.0787
INFO - Step 56018, loss: 0.44307869672775275
########## Evaluation ##########
Timestep: 56018 Average reward is -0.0746
INFO - Step 56173, loss: 0.52665084600448614
########## Evaluation ##########
Timestep: 56173 Average reward is -0.0767
INFO - Step 56200, loss: 0.50100111961364757
INFO - Copied model parameters to target network.
INFO - Step 56327, loss: 0.43206003308296204
########## Evaluation ##########
Timestep: 56327 Average reward is -0.0931
INFO - Step 56485, loss: 0.36028045415878296
########## Evaluation ##########
Timestep: 56485 Average reward is -0.0724
INFO - Step 56648, loss: 0.38745051622390747
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 57101, loss: 0.53190112113952643
########## Evaluation ##########
Timestep: 57101 Average reward is -0.0767
INFO - Step 57200, loss: 0.51963466405868534
INFO - Copied model parameters to target network.
INFO - Step 57258, loss: 0.53008919954299933
########## Evaluation ##########
Timestep: 57258 Average reward is -0.0624
INFO - Step 57409, loss: 0.43970298767089844
########## Evaluation ##########
Timestep: 57409 Average reward is -0.072
INFO - Step 57568, loss: 0.58224898576736453
########## Evaluation ##########
Timestep: 57568 Average reward is -0.0659
INFO - Step 57728, loss: 0.53704839944839485
########## Evaluation ##########
Timestep: 57728 Average reward is -0.0597
INFO - Step 57884, loss: 0.53368282318115236
########## Evaluation ##########
Timestep: 57884 Average reward is -0.0683
INFO - Step 58037, loss: 0.42325466871261597
########## Evaluation ##########
Timestep: 58037 Average reward is -0.0727
INFO - Step 58191, loss: 0.44610711932182313
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 58668, loss: 0.44015106558799744
########## Evaluation ##########
Timestep: 58668 Average reward is -0.0771
INFO - Step 58818, loss: 0.47718429565429694
########## Evaluation ##########
Timestep: 58818 Average reward is -0.0701
INFO - Step 58962, loss: 0.50922501087188727
########## Evaluation ##########
Timestep: 58962 Average reward is -0.0629
INFO - Step 59111, loss: 0.56874006986618046
########## Evaluation ##########
Timestep: 59111 Average reward is -0.0604
INFO - Step 59200, loss: 0.50728344917297365
INFO - Copied model parameters to target network.
INFO - Step 59270, loss: 0.54465258121490485
########## Evaluation ##########
Timestep: 59270 Average reward is -0.0748
INFO - Step 59431, loss: 0.45368400216102687
########## Evaluation ##########
Timestep: 59431 Average reward is -0.065
INFO - Step 59579, loss: 0.46290487051010136
########## Evaluation ##########
Timestep: 59579 Average reward is -0.0681
INFO - Step 59729, loss: 0.58386331796646126
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 60200, loss: 0.75787067413330085
INFO - Copied model parameters to target network.
INFO - Step 60206, loss: 0.3064471185207367
########## Evaluation ##########
Timestep: 60206 Average reward is -0.0927
INFO - Step 60357, loss: 0.40485149621963557
########## Evaluation ##########
Timestep: 60357 Average reward is -0.0752
INFO - Step 60510, loss: 0.29247671365737915
########## Evaluation ##########
Timestep: 60510 Average reward is -0.0945
INFO - Step 60667, loss: 0.48497736454010017
########## Evaluation ##########
Timestep: 60667 Average reward is -0.0657
INFO - Step 60818, loss: 0.55542135238647467
########## Evaluation ##########
Timestep: 60818 Average reward is -0.067
INFO - Step 60979, loss: 0.56259810924530034
########## Evaluation ##########
Timestep: 60979 Average reward is -0.0739
INFO - Step 61145, loss: 0.38661205768585205
########## Evaluation ##########
Timestep: 61145 Average reward is -0.0704
INFO - Step 61200, loss: 0.43417724967002874
INFO - Copied model pa

  fig, ax = plt.subplots()


INFO - Step 61782, loss: 0.47345459461212163
########## Evaluation ##########
Timestep: 61782 Average reward is -0.0864
INFO - Step 61930, loss: 0.53247463703155526
########## Evaluation ##########
Timestep: 61930 Average reward is -0.0696
INFO - Step 62087, loss: 0.55963563919067384
########## Evaluation ##########
Timestep: 62087 Average reward is -0.0812
INFO - Step 62200, loss: 0.45922699570655824
INFO - Copied model parameters to target network.
INFO - Step 62237, loss: 0.60313481092453535
########## Evaluation ##########
Timestep: 62237 Average reward is -0.0704
INFO - Step 62407, loss: 0.66523820161819466
########## Evaluation ##########
Timestep: 62407 Average reward is -0.0487
INFO - Step 62559, loss: 0.40160217881202793
########## Evaluation ##########
Timestep: 62559 Average reward is -0.0705
INFO - Step 62713, loss: 0.47055840492248535
########## Evaluation ##########
Timestep: 62713 Average reward is -0.0852
INFO - Step 62865, loss: 0.42218029499053955
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 63200, loss: 0.49465543031692505
INFO - Copied model parameters to target network.
INFO - Step 63314, loss: 0.55354529619216926
########## Evaluation ##########
Timestep: 63314 Average reward is -0.0586
INFO - Step 63467, loss: 0.56601554155349736
########## Evaluation ##########
Timestep: 63467 Average reward is -0.082
INFO - Step 63619, loss: 0.42693373560905457
########## Evaluation ##########
Timestep: 63619 Average reward is -0.0831
INFO - Step 63776, loss: 0.60481995344161997
########## Evaluation ##########
Timestep: 63776 Average reward is -0.0776
INFO - Step 63945, loss: 0.83554708957672124
########## Evaluation ##########
Timestep: 63945 Average reward is -0.0663
INFO - Step 64090, loss: 0.43757027387619026
########## Evaluation ##########
Timestep: 64090 Average reward is -0.0749
INFO - Step 64200, loss: 0.40073668956756596
INFO - Copied model parameters to target network.
INFO - Step 64246, loss: 0.48361694812774665
########## Evaluation ##########
Timestep: 642

  fig, ax = plt.subplots()


INFO - Step 64885, loss: 0.30784177780151367
########## Evaluation ##########
Timestep: 64885 Average reward is -0.0585
INFO - Step 65031, loss: 0.43238979578018196
########## Evaluation ##########
Timestep: 65031 Average reward is -0.0743
INFO - Step 65187, loss: 0.52871149778366094
########## Evaluation ##########
Timestep: 65187 Average reward is -0.0517
INFO - Step 65200, loss: 0.60495269298553473
INFO - Copied model parameters to target network.
INFO - Step 65341, loss: 0.58081173896789555
########## Evaluation ##########
Timestep: 65341 Average reward is -0.0654
INFO - Step 65483, loss: 0.52499711513519293
########## Evaluation ##########
Timestep: 65483 Average reward is -0.0822
INFO - Step 65637, loss: 0.47436010837554933
########## Evaluation ##########
Timestep: 65637 Average reward is -0.0752
INFO - Step 65798, loss: 0.54660093784332284
########## Evaluation ##########
Timestep: 65798 Average reward is -0.0646
INFO - Step 65949, loss: 0.31455582380294837
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 66406, loss: 0.62977164983749397
########## Evaluation ##########
Timestep: 66406 Average reward is -0.0708
INFO - Step 66551, loss: 0.50515490770339976
########## Evaluation ##########
Timestep: 66551 Average reward is -0.0715
INFO - Step 66690, loss: 0.40353655815124515
########## Evaluation ##########
Timestep: 66690 Average reward is -0.0793
INFO - Step 66842, loss: 0.47500500082969666
########## Evaluation ##########
Timestep: 66842 Average reward is -0.0683
INFO - Step 66997, loss: 0.41104179620742835
########## Evaluation ##########
Timestep: 66997 Average reward is -0.051
INFO - Step 67155, loss: 0.56698167324066164
########## Evaluation ##########
Timestep: 67155 Average reward is -0.0591
INFO - Step 67200, loss: 0.46541625261306765
INFO - Copied model parameters to target network.
INFO - Step 67318, loss: 0.40734261274337775
########## Evaluation ##########
Timestep: 67318 Average reward is -0.0677
INFO - Step 67468, loss: 0.45678368210792544
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 67927, loss: 0.64913702011108474
########## Evaluation ##########
Timestep: 67927 Average reward is -0.0686
INFO - Step 68088, loss: 0.40895625948905945
########## Evaluation ##########
Timestep: 68088 Average reward is -0.064
INFO - Step 68200, loss: 0.55513209104537966
INFO - Copied model parameters to target network.
INFO - Step 68248, loss: 0.35845118761062626
########## Evaluation ##########
Timestep: 68248 Average reward is -0.0664
INFO - Step 68405, loss: 0.43231099843978884
########## Evaluation ##########
Timestep: 68405 Average reward is -0.0798
INFO - Step 68548, loss: 0.47371941804885864
########## Evaluation ##########
Timestep: 68548 Average reward is -0.0632
INFO - Step 68704, loss: 0.46396651864051826
########## Evaluation ##########
Timestep: 68704 Average reward is -0.085
INFO - Step 68867, loss: 0.45777744054794317
########## Evaluation ##########
Timestep: 68867 Average reward is -0.0508
INFO - Step 69036, loss: 0.41163825988769535
########## Evaluation 

  fig, ax = plt.subplots()


INFO - Step 69491, loss: 0.53026843070983896
########## Evaluation ##########
Timestep: 69491 Average reward is -0.0773
INFO - Step 69649, loss: 0.57983267307281496
########## Evaluation ##########
Timestep: 69649 Average reward is -0.0603
INFO - Step 69807, loss: 0.52336061000823976
########## Evaluation ##########
Timestep: 69807 Average reward is -0.0609
INFO - Step 69958, loss: 0.45117539167404175
########## Evaluation ##########
Timestep: 69958 Average reward is -0.0723
INFO - Step 70117, loss: 0.60269856452941956
########## Evaluation ##########
Timestep: 70117 Average reward is -0.0486
INFO - Step 70200, loss: 0.64851200580596926
INFO - Copied model parameters to target network.
INFO - Step 70272, loss: 0.63841968774795536
########## Evaluation ##########
Timestep: 70272 Average reward is -0.0454
INFO - Step 70420, loss: 0.52659690380096443
########## Evaluation ##########
Timestep: 70420 Average reward is -0.066
INFO - Step 70587, loss: 0.55693542957305915
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 71045, loss: 0.54890668392181424
########## Evaluation ##########
Timestep: 71045 Average reward is -0.0781
INFO - Step 71193, loss: 0.43616813421249397
########## Evaluation ##########
Timestep: 71193 Average reward is -0.0633
INFO - Step 71200, loss: 0.53606724739074715
INFO - Copied model parameters to target network.
INFO - Step 71352, loss: 0.44255805015563965
########## Evaluation ##########
Timestep: 71352 Average reward is -0.0643
INFO - Step 71509, loss: 0.49652987718582153
########## Evaluation ##########
Timestep: 71509 Average reward is -0.0564
INFO - Step 71666, loss: 0.48906353116035464
########## Evaluation ##########
Timestep: 71666 Average reward is -0.0625
INFO - Step 71818, loss: 0.58443927764892583
########## Evaluation ##########
Timestep: 71818 Average reward is -0.0659
INFO - Step 71975, loss: 0.42648535966873177
########## Evaluation ##########
Timestep: 71975 Average reward is -0.0644
INFO - Step 72127, loss: 0.46798053383827215
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 72589, loss: 0.58621323108673134
########## Evaluation ##########
Timestep: 72589 Average reward is -0.0617
INFO - Step 72738, loss: 0.58287680149078376
########## Evaluation ##########
Timestep: 72738 Average reward is -0.0696
INFO - Step 72902, loss: 0.45365327596664434
########## Evaluation ##########
Timestep: 72902 Average reward is -0.063
INFO - Step 73040, loss: 0.55458271503448493
########## Evaluation ##########
Timestep: 73040 Average reward is -0.0676
INFO - Step 73200, loss: 0.55158638954162616
INFO - Copied model parameters to target network.
INFO - Step 73206, loss: 0.40522861480712895
########## Evaluation ##########
Timestep: 73206 Average reward is -0.0819
INFO - Step 73359, loss: 0.63661950826644955
########## Evaluation ##########
Timestep: 73359 Average reward is -0.0626
INFO - Step 73501, loss: 0.51846098899841315
########## Evaluation ##########
Timestep: 73501 Average reward is -0.0631
INFO - Step 73656, loss: 0.50694036483764653
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 74119, loss: 0.50637137889862064
########## Evaluation ##########
Timestep: 74119 Average reward is -0.0604
INFO - Step 74200, loss: 0.39041590690612793
INFO - Copied model parameters to target network.
INFO - Step 74257, loss: 0.41646993160247845
########## Evaluation ##########
Timestep: 74257 Average reward is -0.0621
INFO - Step 74419, loss: 0.35439485311508186
########## Evaluation ##########
Timestep: 74419 Average reward is -0.061
INFO - Step 74580, loss: 0.56191104650497445
########## Evaluation ##########
Timestep: 74580 Average reward is -0.0575
INFO - Step 74725, loss: 0.57701694965362557
########## Evaluation ##########
Timestep: 74725 Average reward is -0.063
INFO - Step 74873, loss: 0.52950263023376465
########## Evaluation ##########
Timestep: 74873 Average reward is -0.0665
INFO - Step 75020, loss: 0.55392187833786015
########## Evaluation ##########
Timestep: 75020 Average reward is -0.0601
INFO - Step 75181, loss: 0.49302610754966736
########## Evaluation 

  fig, ax = plt.subplots()


INFO - Step 75641, loss: 0.69654941558837893
########## Evaluation ##########
Timestep: 75641 Average reward is -0.0653
INFO - Step 75801, loss: 0.58188569545745856
########## Evaluation ##########
Timestep: 75801 Average reward is -0.0577
INFO - Step 75949, loss: 0.47878539562225346
########## Evaluation ##########
Timestep: 75949 Average reward is -0.0683
INFO - Step 76114, loss: 0.47921508550643927
########## Evaluation ##########
Timestep: 76114 Average reward is -0.0785
INFO - Step 76200, loss: 0.48422998189926157
INFO - Copied model parameters to target network.
INFO - Step 76264, loss: 0.51012593507766725
########## Evaluation ##########
Timestep: 76264 Average reward is -0.0616
INFO - Step 76409, loss: 0.49580091238021856
########## Evaluation ##########
Timestep: 76409 Average reward is -0.0531
INFO - Step 76560, loss: 0.28515833616256714
########## Evaluation ##########
Timestep: 76560 Average reward is -0.0635
INFO - Step 76725, loss: 0.53621709346771244
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 77193, loss: 0.43464851379394534
########## Evaluation ##########
Timestep: 77193 Average reward is -0.0652
INFO - Step 77200, loss: 0.55804193019866946
INFO - Copied model parameters to target network.
INFO - Step 77340, loss: 0.57760548591613774
########## Evaluation ##########
Timestep: 77340 Average reward is -0.072
INFO - Step 77491, loss: 0.49040150642395026
########## Evaluation ##########
Timestep: 77491 Average reward is -0.054
INFO - Step 77642, loss: 0.69123804569244385
########## Evaluation ##########
Timestep: 77642 Average reward is -0.0802
INFO - Step 77784, loss: 0.33961454033851624
########## Evaluation ##########
Timestep: 77784 Average reward is -0.0654
INFO - Step 77941, loss: 0.42905211448669434
########## Evaluation ##########
Timestep: 77941 Average reward is -0.0619
INFO - Step 78091, loss: 0.53051519393920937
########## Evaluation ##########
Timestep: 78091 Average reward is -0.0532
INFO - Step 78200, loss: 0.48724848031997684
INFO - Copied model pa

  fig, ax = plt.subplots()


INFO - Step 78683, loss: 0.71927702426910476
########## Evaluation ##########
Timestep: 78683 Average reward is -0.0643
INFO - Step 78834, loss: 0.58005046844482427
########## Evaluation ##########
Timestep: 78834 Average reward is -0.0614
INFO - Step 78980, loss: 0.58266580104827884
########## Evaluation ##########
Timestep: 78980 Average reward is -0.0662
INFO - Step 79132, loss: 0.49236622452735934
########## Evaluation ##########
Timestep: 79132 Average reward is -0.0511
INFO - Step 79200, loss: 0.56763291358947757
INFO - Copied model parameters to target network.
INFO - Step 79285, loss: 0.43288943171501165
########## Evaluation ##########
Timestep: 79285 Average reward is -0.0767
INFO - Step 79438, loss: 0.51267898082733155
########## Evaluation ##########
Timestep: 79438 Average reward is -0.0664
INFO - Step 79605, loss: 0.26425683498382576
########## Evaluation ##########
Timestep: 79605 Average reward is -0.0564
INFO - Step 79767, loss: 0.40540564060211183
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 80200, loss: 0.39794278144836426
INFO - Copied model parameters to target network.
INFO - Step 80221, loss: 0.41682136058807373
########## Evaluation ##########
Timestep: 80221 Average reward is -0.0395
INFO - Step 80371, loss: 0.33524119853973394
########## Evaluation ##########
Timestep: 80371 Average reward is -0.0668
INFO - Step 80515, loss: 0.50624406337738043
########## Evaluation ##########
Timestep: 80515 Average reward is -0.0555
INFO - Step 80670, loss: 0.54013496637344366
########## Evaluation ##########
Timestep: 80670 Average reward is -0.0628
INFO - Step 80827, loss: 0.66880929470062267
########## Evaluation ##########
Timestep: 80827 Average reward is -0.0708
INFO - Step 80981, loss: 0.28581961989402776
########## Evaluation ##########
Timestep: 80981 Average reward is -0.0555
INFO - Step 81121, loss: 0.46450299024581913
########## Evaluation ##########
Timestep: 81121 Average reward is -0.0535
INFO - Step 81200, loss: 0.50226926803588873
INFO - Copied model 

  fig, ax = plt.subplots()


INFO - Step 81705, loss: 0.48607933521270756
########## Evaluation ##########
Timestep: 81705 Average reward is -0.0794
INFO - Step 81859, loss: 0.56200039386749274
########## Evaluation ##########
Timestep: 81859 Average reward is -0.0604
INFO - Step 82011, loss: 0.39098989963531494
########## Evaluation ##########
Timestep: 82011 Average reward is -0.0605
INFO - Step 82156, loss: 0.52569282054901124
########## Evaluation ##########
Timestep: 82156 Average reward is -0.0651
INFO - Step 82200, loss: 0.43222352862358093
INFO - Copied model parameters to target network.
INFO - Step 82333, loss: 0.42065834999084474
########## Evaluation ##########
Timestep: 82333 Average reward is -0.0746
INFO - Step 82474, loss: 0.30332586169242863
########## Evaluation ##########
Timestep: 82474 Average reward is -0.0606
INFO - Step 82628, loss: 0.37467432022094727
########## Evaluation ##########
Timestep: 82628 Average reward is -0.0605
INFO - Step 82779, loss: 0.58087992668151866
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 83200, loss: 0.51887458562850955
INFO - Copied model parameters to target network.
INFO - Step 83226, loss: 0.51239293813705443
########## Evaluation ##########
Timestep: 83226 Average reward is -0.0685
INFO - Step 83387, loss: 0.44383835792541504
########## Evaluation ##########
Timestep: 83387 Average reward is -0.0661
INFO - Step 83550, loss: 0.41342419385910034
########## Evaluation ##########
Timestep: 83550 Average reward is -0.0632
INFO - Step 83715, loss: 0.32969671487808236
########## Evaluation ##########
Timestep: 83715 Average reward is -0.0551
INFO - Step 83870, loss: 0.54911303520202645
########## Evaluation ##########
Timestep: 83870 Average reward is -0.06
INFO - Step 84022, loss: 0.59840381145477373
########## Evaluation ##########
Timestep: 84022 Average reward is -0.0655
INFO - Step 84166, loss: 0.39526742696762085
########## Evaluation ##########
Timestep: 84166 Average reward is -0.0605
INFO - Step 84200, loss: 0.49681580066680917
INFO - Copied model pa

  fig, ax = plt.subplots()


INFO - Step 84770, loss: 0.55067074298858644
########## Evaluation ##########
Timestep: 84770 Average reward is -0.0648
INFO - Step 84929, loss: 0.28390440344810486
########## Evaluation ##########
Timestep: 84929 Average reward is -0.0733
INFO - Step 85085, loss: 0.34538221359252935
########## Evaluation ##########
Timestep: 85085 Average reward is -0.0645
INFO - Step 85200, loss: 0.49645400047302246
INFO - Copied model parameters to target network.
INFO - Step 85246, loss: 0.49633875489234924
########## Evaluation ##########
Timestep: 85246 Average reward is -0.057
INFO - Step 85407, loss: 0.52779984474182135
########## Evaluation ##########
Timestep: 85407 Average reward is -0.0593
INFO - Step 85565, loss: 0.39939704537391663
########## Evaluation ##########
Timestep: 85565 Average reward is -0.0631
INFO - Step 85716, loss: 0.53914821147918726
########## Evaluation ##########
Timestep: 85716 Average reward is -0.0709
INFO - Step 85868, loss: 0.51809674501419077
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 86200, loss: 0.59056913852691657
INFO - Copied model parameters to target network.
INFO - Step 86308, loss: 0.62272703647613533
########## Evaluation ##########
Timestep: 86308 Average reward is -0.0729
INFO - Step 86454, loss: 0.48763698339462287
########## Evaluation ##########
Timestep: 86454 Average reward is -0.0484
INFO - Step 86611, loss: 0.34255349636077886
########## Evaluation ##########
Timestep: 86611 Average reward is -0.069
INFO - Step 86762, loss: 0.42316347360610966
########## Evaluation ##########
Timestep: 86762 Average reward is -0.0657
INFO - Step 86917, loss: 0.50572657585144043
########## Evaluation ##########
Timestep: 86917 Average reward is -0.0575
INFO - Step 87085, loss: 0.51798546314239595
########## Evaluation ##########
Timestep: 87085 Average reward is -0.0733
INFO - Step 87200, loss: 0.48549062013626157
INFO - Copied model parameters to target network.
INFO - Step 87231, loss: 0.43946015834808354
########## Evaluation ##########
Timestep: 872

  fig, ax = plt.subplots()


INFO - Step 87831, loss: 0.31310042738914497
########## Evaluation ##########
Timestep: 87831 Average reward is -0.0682
INFO - Step 87982, loss: 0.38433846831321716
########## Evaluation ##########
Timestep: 87982 Average reward is -0.0755
INFO - Step 88132, loss: 0.42378208041191134
########## Evaluation ##########
Timestep: 88132 Average reward is -0.0525
INFO - Step 88200, loss: 0.50453436374664316
INFO - Copied model parameters to target network.
INFO - Step 88292, loss: 0.57295560836791997
########## Evaluation ##########
Timestep: 88292 Average reward is -0.063
INFO - Step 88437, loss: 0.40682575106620793
########## Evaluation ##########
Timestep: 88437 Average reward is -0.0523
INFO - Step 88589, loss: 0.37865206599235535
########## Evaluation ##########
Timestep: 88589 Average reward is -0.0633
INFO - Step 88739, loss: 0.74836087226867686
########## Evaluation ##########
Timestep: 88739 Average reward is -0.0542
INFO - Step 88886, loss: 0.38599017262458876
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 89359, loss: 0.53978639841079716
########## Evaluation ##########
Timestep: 89359 Average reward is -0.0697
INFO - Step 89508, loss: 0.55918371677398687
########## Evaluation ##########
Timestep: 89508 Average reward is -0.068
INFO - Step 89669, loss: 0.55986434221267736
########## Evaluation ##########
Timestep: 89669 Average reward is -0.0612
INFO - Step 89827, loss: 0.41316789388656616
########## Evaluation ##########
Timestep: 89827 Average reward is -0.0708
INFO - Step 89983, loss: 0.48120242357254037
########## Evaluation ##########
Timestep: 89983 Average reward is -0.0544
INFO - Step 90124, loss: 0.42863786220550537
########## Evaluation ##########
Timestep: 90124 Average reward is -0.0719
INFO - Step 90200, loss: 0.39806210994720463
INFO - Copied model parameters to target network.
INFO - Step 90269, loss: 0.39977893233299255
########## Evaluation ##########
Timestep: 90269 Average reward is -0.0544
INFO - Step 90415, loss: 0.56101977825164854
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 90864, loss: 0.41780167818069465
########## Evaluation ##########
Timestep: 90864 Average reward is -0.0445
INFO - Step 91007, loss: 0.52893304824829163
########## Evaluation ##########
Timestep: 91007 Average reward is -0.0712
INFO - Step 91147, loss: 0.29306530952453613
########## Evaluation ##########
Timestep: 91147 Average reward is -0.0592
INFO - Step 91200, loss: 0.49501892924308777
INFO - Copied model parameters to target network.
INFO - Step 91296, loss: 0.44720214605331427
########## Evaluation ##########
Timestep: 91296 Average reward is -0.0536
INFO - Step 91438, loss: 0.44284182786941536
########## Evaluation ##########
Timestep: 91438 Average reward is -0.0658
INFO - Step 91584, loss: 0.41030311584472656
########## Evaluation ##########
Timestep: 91584 Average reward is -0.0551
INFO - Step 91738, loss: 0.59477245807647723
########## Evaluation ##########
Timestep: 91738 Average reward is -0.0479
INFO - Step 91892, loss: 0.38130551576614383
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 92361, loss: 0.49807739257812525
########## Evaluation ##########
Timestep: 92361 Average reward is -0.0732
INFO - Step 92517, loss: 0.55878168344497685
########## Evaluation ##########
Timestep: 92517 Average reward is -0.0641
INFO - Step 92672, loss: 0.71982353925704965
########## Evaluation ##########
Timestep: 92672 Average reward is -0.0865
INFO - Step 92826, loss: 0.63611471652984624
########## Evaluation ##########
Timestep: 92826 Average reward is -0.0594
INFO - Step 92969, loss: 0.55108910799026496
########## Evaluation ##########
Timestep: 92969 Average reward is -0.0713
INFO - Step 93130, loss: 0.69838333129882815
########## Evaluation ##########
Timestep: 93130 Average reward is -0.0764
INFO - Step 93200, loss: 0.53027266263961796
INFO - Copied model parameters to target network.
INFO - Step 93286, loss: 0.48835915327072144
########## Evaluation ##########
Timestep: 93286 Average reward is -0.071
INFO - Step 93444, loss: 0.43024227023124695
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 93912, loss: 0.36953112483024597
########## Evaluation ##########
Timestep: 93912 Average reward is -0.0715
INFO - Step 94069, loss: 0.49530112743377686
########## Evaluation ##########
Timestep: 94069 Average reward is -0.0749
INFO - Step 94200, loss: 0.43220525979995736
INFO - Copied model parameters to target network.
INFO - Step 94220, loss: 0.63826262950897225
########## Evaluation ##########
Timestep: 94220 Average reward is -0.0786
INFO - Step 94375, loss: 0.48552858829498297
########## Evaluation ##########
Timestep: 94375 Average reward is -0.0659
INFO - Step 94529, loss: 0.45444628596305847
########## Evaluation ##########
Timestep: 94529 Average reward is -0.0793
INFO - Step 94678, loss: 0.43033501505851746
########## Evaluation ##########
Timestep: 94678 Average reward is -0.0621
INFO - Step 94832, loss: 0.39325350522994995
########## Evaluation ##########
Timestep: 94832 Average reward is -0.0724
INFO - Step 94982, loss: 0.37634688615798955
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 95442, loss: 0.42690998315811163
########## Evaluation ##########
Timestep: 95442 Average reward is -0.0653
INFO - Step 95594, loss: 0.43023529648780825
########## Evaluation ##########
Timestep: 95594 Average reward is -0.0503
INFO - Step 95751, loss: 0.42527011036872864
########## Evaluation ##########
Timestep: 95751 Average reward is -0.0725
INFO - Step 95908, loss: 0.52351570129394537
########## Evaluation ##########
Timestep: 95908 Average reward is -0.0581
INFO - Step 96067, loss: 0.59701281785964973
########## Evaluation ##########
Timestep: 96067 Average reward is -0.0792
INFO - Step 96200, loss: 0.44005766510963444
INFO - Copied model parameters to target network.
INFO - Step 96208, loss: 0.35331162810325624
########## Evaluation ##########
Timestep: 96208 Average reward is -0.0532
INFO - Step 96343, loss: 0.27983814477920537
########## Evaluation ##########
Timestep: 96343 Average reward is -0.0599
INFO - Step 96494, loss: 0.61692047119140625
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 96942, loss: 0.49266147613525397
########## Evaluation ##########
Timestep: 96942 Average reward is -0.0627
INFO - Step 97105, loss: 0.49486690759658813
########## Evaluation ##########
Timestep: 97105 Average reward is -0.0568
INFO - Step 97200, loss: 0.71657067537307743
INFO - Copied model parameters to target network.
INFO - Step 97241, loss: 0.37820780277252296
########## Evaluation ##########
Timestep: 97241 Average reward is -0.0672
INFO - Step 97389, loss: 0.61773484945297245
########## Evaluation ##########
Timestep: 97389 Average reward is -0.0591
INFO - Step 97531, loss: 0.45162174105644226
########## Evaluation ##########
Timestep: 97531 Average reward is -0.0588
INFO - Step 97682, loss: 0.42552331089973454
########## Evaluation ##########
Timestep: 97682 Average reward is -0.0787
INFO - Step 97845, loss: 0.47078275680541996
########## Evaluation ##########
Timestep: 97845 Average reward is -0.0704
INFO - Step 98000, loss: 0.44425547122955326
########## Evaluatio

  fig, ax = plt.subplots()


INFO - Step 98447, loss: 0.50544238090515145
########## Evaluation ##########
Timestep: 98447 Average reward is -0.0617
INFO - Step 98597, loss: 0.71602845191955576
########## Evaluation ##########
Timestep: 98597 Average reward is -0.0632
INFO - Step 98737, loss: 0.65194821357727057
########## Evaluation ##########
Timestep: 98737 Average reward is -0.0876
INFO - Step 98888, loss: 0.47258692979812625
########## Evaluation ##########
Timestep: 98888 Average reward is -0.0654
INFO - Step 99047, loss: 0.61627292633056644
########## Evaluation ##########
Timestep: 99047 Average reward is -0.0556
INFO - Step 99195, loss: 0.67595940828323365
########## Evaluation ##########
Timestep: 99195 Average reward is -0.0614
INFO - Step 99200, loss: 0.41375872492790225
INFO - Copied model parameters to target network.
INFO - Step 99348, loss: 0.44663399457931526
########## Evaluation ##########
Timestep: 99348 Average reward is -0.075
INFO - Step 99502, loss: 0.57343888282775884
########## Evaluation

  fig, ax = plt.subplots()


INFO - Step 99981, loss: 0.62409949302673343
########## Evaluation ##########
Timestep: 99981 Average reward is -0.059
INFO - Step 100128, loss: 0.67097961902618414
########## Evaluation ##########
Timestep: 100128 Average reward is -0.0674
INFO - Step 100200, loss: 0.49552902579307556
INFO - Copied model parameters to target network.
INFO - Step 100290, loss: 0.48025894165039064
########## Evaluation ##########
Timestep: 100290 Average reward is -0.0569
INFO - Step 100447, loss: 0.56907486915588383
########## Evaluation ##########
Timestep: 100447 Average reward is -0.055
INFO - Step 100595, loss: 0.37417906522750854
########## Evaluation ##########
Timestep: 100595 Average reward is -0.0763
INFO - Step 100736, loss: 0.54786968231201173
########## Evaluation ##########
Timestep: 100736 Average reward is -0.0626
INFO - Step 100881, loss: 0.50215309858322144
########## Evaluation ##########
Timestep: 100881 Average reward is -0.0776
INFO - Step 101036, loss: 0.42896711826324463
########

  fig, ax = plt.subplots()


INFO - Step 101506, loss: 0.46422761678695685
########## Evaluation ##########
Timestep: 101506 Average reward is -0.0573
INFO - Step 101654, loss: 0.56588143110275276
########## Evaluation ##########
Timestep: 101654 Average reward is -0.0587
INFO - Step 101819, loss: 0.58403384685516365
########## Evaluation ##########
Timestep: 101819 Average reward is -0.0511
INFO - Step 101969, loss: 0.43220713734626776
########## Evaluation ##########
Timestep: 101969 Average reward is -0.065
INFO - Step 102118, loss: 0.35144466161727905
########## Evaluation ##########
Timestep: 102118 Average reward is -0.0773
INFO - Step 102200, loss: 0.53107476234436044
INFO - Copied model parameters to target network.
INFO - Step 102276, loss: 0.46764886379241943
########## Evaluation ##########
Timestep: 102276 Average reward is -0.0535
INFO - Step 102430, loss: 0.58862870931625373
########## Evaluation ##########
Timestep: 102430 Average reward is -0.0493
INFO - Step 102582, loss: 0.48066776990890503
#####

  fig, ax = plt.subplots()


INFO - Step 103021, loss: 0.70034885406494147
########## Evaluation ##########
Timestep: 103021 Average reward is -0.0711
INFO - Step 103178, loss: 0.50178349018096925
########## Evaluation ##########
Timestep: 103178 Average reward is -0.0582
INFO - Step 103200, loss: 0.61462664604187017
INFO - Copied model parameters to target network.
INFO - Step 103332, loss: 0.68516898155212456
########## Evaluation ##########
Timestep: 103332 Average reward is -0.0617
INFO - Step 103472, loss: 0.51884555816650396
########## Evaluation ##########
Timestep: 103472 Average reward is -0.0672
INFO - Step 103617, loss: 0.38815921545028687
########## Evaluation ##########
Timestep: 103617 Average reward is -0.054
INFO - Step 103770, loss: 0.59827518463134773
########## Evaluation ##########
Timestep: 103770 Average reward is -0.0664
INFO - Step 103909, loss: 0.33277896046638494
########## Evaluation ##########
Timestep: 103909 Average reward is -0.0534
INFO - Step 104047, loss: 0.47663056850433356
#####

  fig, ax = plt.subplots()


INFO - Step 104506, loss: 0.50473719835281376
########## Evaluation ##########
Timestep: 104506 Average reward is -0.0729
INFO - Step 104654, loss: 0.32093226909637456
########## Evaluation ##########
Timestep: 104654 Average reward is -0.0783
INFO - Step 104800, loss: 0.54288589954376224
########## Evaluation ##########
Timestep: 104800 Average reward is -0.0741
INFO - Step 104949, loss: 0.48433017730712894
########## Evaluation ##########
Timestep: 104949 Average reward is -0.0584
INFO - Step 105111, loss: 0.42743065953254786
########## Evaluation ##########
Timestep: 105111 Average reward is -0.0741
INFO - Step 105200, loss: 0.52029615640640266
INFO - Copied model parameters to target network.
INFO - Step 105258, loss: 0.52769267559051514
########## Evaluation ##########
Timestep: 105258 Average reward is -0.065
INFO - Step 105404, loss: 0.53209179639816286
########## Evaluation ##########
Timestep: 105404 Average reward is -0.0747
INFO - Step 105564, loss: 0.63457393646240236
#####

  fig, ax = plt.subplots()


INFO - Step 106042, loss: 0.43478333950042725
########## Evaluation ##########
Timestep: 106042 Average reward is -0.0613
INFO - Step 106195, loss: 0.35938060283660896
########## Evaluation ##########
Timestep: 106195 Average reward is -0.0743
INFO - Step 106200, loss: 0.5744437575340271
INFO - Copied model parameters to target network.
INFO - Step 106360, loss: 0.51105719804763796
########## Evaluation ##########
Timestep: 106360 Average reward is -0.062
INFO - Step 106512, loss: 0.51896047592163095
########## Evaluation ##########
Timestep: 106512 Average reward is -0.0718
INFO - Step 106662, loss: 0.44852042198181153
########## Evaluation ##########
Timestep: 106662 Average reward is -0.093
INFO - Step 106829, loss: 0.55909973382949834
########## Evaluation ##########
Timestep: 106829 Average reward is -0.0749
INFO - Step 106968, loss: 0.23371995985507965
########## Evaluation ##########
Timestep: 106968 Average reward is -0.0559
INFO - Step 107136, loss: 0.57939159870147766
#######

  fig, ax = plt.subplots()


INFO - Step 107606, loss: 0.46085518598556527
########## Evaluation ##########
Timestep: 107606 Average reward is -0.0604
INFO - Step 107750, loss: 0.45894476771354675
########## Evaluation ##########
Timestep: 107750 Average reward is -0.0594
INFO - Step 107903, loss: 0.42227125167846684
########## Evaluation ##########
Timestep: 107903 Average reward is -0.0561
INFO - Step 108060, loss: 0.86716890335083016
########## Evaluation ##########
Timestep: 108060 Average reward is -0.0676
INFO - Step 108200, loss: 0.71887797117233283
INFO - Copied model parameters to target network.
INFO - Step 108211, loss: 0.38999366760253906
########## Evaluation ##########
Timestep: 108211 Average reward is -0.0574
INFO - Step 108361, loss: 0.51949417591094977
########## Evaluation ##########
Timestep: 108361 Average reward is -0.0609
INFO - Step 108514, loss: 0.38415575027465825
########## Evaluation ##########
Timestep: 108514 Average reward is -0.0767
INFO - Step 108673, loss: 0.38672709465026855
####

  fig, ax = plt.subplots()


INFO - Step 109137, loss: 0.38357141613960266
########## Evaluation ##########
Timestep: 109137 Average reward is -0.0578
INFO - Step 109200, loss: 0.44780027866363525
INFO - Copied model parameters to target network.
INFO - Step 109293, loss: 0.57739639282226565
########## Evaluation ##########
Timestep: 109293 Average reward is -0.0795
INFO - Step 109456, loss: 0.52253341674804694
########## Evaluation ##########
Timestep: 109456 Average reward is -0.0616
INFO - Step 109614, loss: 0.38866862654685974
########## Evaluation ##########
Timestep: 109614 Average reward is -0.0615
INFO - Step 109770, loss: 0.42592817544937134
########## Evaluation ##########
Timestep: 109770 Average reward is -0.0629
INFO - Step 109925, loss: 0.48615795373916626
########## Evaluation ##########
Timestep: 109925 Average reward is -0.0616
INFO - Step 110074, loss: 0.45036277174949646
########## Evaluation ##########
Timestep: 110074 Average reward is -0.069
INFO - Step 110200, loss: 0.47676575183868413
INFO 

  fig, ax = plt.subplots()


INFO - Step 110677, loss: 0.51089203357696534
########## Evaluation ##########
Timestep: 110677 Average reward is -0.0614
INFO - Step 110819, loss: 0.41312187910079956
########## Evaluation ##########
Timestep: 110819 Average reward is -0.0706
INFO - Step 110968, loss: 0.59915643930435186
########## Evaluation ##########
Timestep: 110968 Average reward is -0.0796
INFO - Step 111121, loss: 0.44711196422576904
########## Evaluation ##########
Timestep: 111121 Average reward is -0.068
INFO - Step 111200, loss: 0.53525292873382576
INFO - Copied model parameters to target network.
INFO - Step 111279, loss: 0.43553531169891364
########## Evaluation ##########
Timestep: 111279 Average reward is -0.0691
INFO - Step 111432, loss: 0.52443099021911623
########## Evaluation ##########
Timestep: 111432 Average reward is -0.0816
INFO - Step 111584, loss: 0.43804451823234564
########## Evaluation ##########
Timestep: 111584 Average reward is -0.0611
INFO - Step 111741, loss: 0.58061677217483526
#####

  fig, ax = plt.subplots()


INFO - Step 112200, loss: 0.44514483213424683
INFO - Copied model parameters to target network.
INFO - Step 112221, loss: 0.54909002780914316
########## Evaluation ##########
Timestep: 112221 Average reward is -0.0536
INFO - Step 112365, loss: 0.56103754043579164
########## Evaluation ##########
Timestep: 112365 Average reward is -0.0563
INFO - Step 112525, loss: 0.49185746908187866
########## Evaluation ##########
Timestep: 112525 Average reward is -0.0785
INFO - Step 112683, loss: 0.47295725345611575
########## Evaluation ##########
Timestep: 112683 Average reward is -0.0734
INFO - Step 112834, loss: 0.54254209995269786
########## Evaluation ##########
Timestep: 112834 Average reward is -0.0602
INFO - Step 112978, loss: 0.48008161783218384
########## Evaluation ##########
Timestep: 112978 Average reward is -0.0859
INFO - Step 113130, loss: 0.53497552871704186
########## Evaluation ##########
Timestep: 113130 Average reward is -0.0681
INFO - Step 113200, loss: 0.48216640949249273
INFO

  fig, ax = plt.subplots()


INFO - Step 113734, loss: 0.48919147253036573
########## Evaluation ##########
Timestep: 113734 Average reward is -0.0684
INFO - Step 113887, loss: 0.48505076766014185
########## Evaluation ##########
Timestep: 113887 Average reward is -0.0666
INFO - Step 114044, loss: 0.58285504579544073
########## Evaluation ##########
Timestep: 114044 Average reward is -0.0536
INFO - Step 114196, loss: 0.43180578947067265
########## Evaluation ##########
Timestep: 114196 Average reward is -0.0549
INFO - Step 114200, loss: 0.43795740604400635
INFO - Copied model parameters to target network.
INFO - Step 114358, loss: 0.43605694174766546
########## Evaluation ##########
Timestep: 114358 Average reward is -0.0621
INFO - Step 114503, loss: 0.52994024753570564
########## Evaluation ##########
Timestep: 114503 Average reward is -0.0668
INFO - Step 114658, loss: 0.44116857647895813
########## Evaluation ##########
Timestep: 114658 Average reward is -0.0685
INFO - Step 114815, loss: 0.53521692752838134
####

  fig, ax = plt.subplots()


INFO - Step 115200, loss: 0.35432827472686774
INFO - Copied model parameters to target network.
INFO - Step 115259, loss: 0.49762979149818426
########## Evaluation ##########
Timestep: 115259 Average reward is -0.0628
INFO - Step 115411, loss: 0.43828254938125617
########## Evaluation ##########
Timestep: 115411 Average reward is -0.0624
INFO - Step 115561, loss: 0.50187540054321294
########## Evaluation ##########
Timestep: 115561 Average reward is -0.0834
INFO - Step 115709, loss: 0.70300734043121344
########## Evaluation ##########
Timestep: 115709 Average reward is -0.0705
INFO - Step 115865, loss: 0.39999109506607056
########## Evaluation ##########
Timestep: 115865 Average reward is -0.0626
INFO - Step 116010, loss: 0.34830814599990845
########## Evaluation ##########
Timestep: 116010 Average reward is -0.0756
INFO - Step 116163, loss: 0.33256441354751587
########## Evaluation ##########
Timestep: 116163 Average reward is -0.0802
INFO - Step 116200, loss: 0.40451222658157356
INFO

  fig, ax = plt.subplots()



########## Evaluation ##########
Timestep: 116608 Average reward is -0.0773
INFO - Step 116757, loss: 0.65079391002655033
########## Evaluation ##########
Timestep: 116757 Average reward is -0.0625
INFO - Step 116908, loss: 0.56244534254074154
########## Evaluation ##########
Timestep: 116908 Average reward is -0.0617
INFO - Step 117076, loss: 0.36107361316680914
########## Evaluation ##########
Timestep: 117076 Average reward is -0.0669
INFO - Step 117200, loss: 0.41664358973503113
INFO - Copied model parameters to target network.
INFO - Step 117233, loss: 0.48362904787063633
########## Evaluation ##########
Timestep: 117233 Average reward is -0.0757
INFO - Step 117377, loss: 0.56653404235839846
########## Evaluation ##########
Timestep: 117377 Average reward is -0.0642
INFO - Step 117523, loss: 0.52650660276412967
########## Evaluation ##########
Timestep: 117523 Average reward is -0.0606
INFO - Step 117673, loss: 0.39014303684234626
########## Evaluation ##########
Timestep: 117673

  fig, ax = plt.subplots()


INFO - Step 118200, loss: 0.33878499269485474
INFO - Copied model parameters to target network.
INFO - Step 118278, loss: 0.68876326084136966
########## Evaluation ##########
Timestep: 118278 Average reward is -0.0607
INFO - Step 118430, loss: 0.47570705413818365
########## Evaluation ##########
Timestep: 118430 Average reward is -0.0668
INFO - Step 118580, loss: 0.56239235401153564
########## Evaluation ##########
Timestep: 118580 Average reward is -0.0568
INFO - Step 118751, loss: 0.44936305284500123
########## Evaluation ##########
Timestep: 118751 Average reward is -0.0714
INFO - Step 118906, loss: 0.61495339870452887
########## Evaluation ##########
Timestep: 118906 Average reward is -0.0693
INFO - Step 119062, loss: 0.44296789169311523
########## Evaluation ##########
Timestep: 119062 Average reward is -0.085
INFO - Step 119200, loss: 0.38169294595718384
INFO - Copied model parameters to target network.
INFO - Step 119224, loss: 0.48126223683357246
########## Evaluation #########

  fig, ax = plt.subplots()


INFO - Step 119833, loss: 0.59937405586242687
########## Evaluation ##########
Timestep: 119833 Average reward is -0.0704
INFO - Step 119987, loss: 0.39220386743545537
########## Evaluation ##########
Timestep: 119987 Average reward is -0.07
INFO - Step 120137, loss: 0.39947149157524113
########## Evaluation ##########
Timestep: 120137 Average reward is -0.0611
INFO - Step 120200, loss: 0.38719910383224493
INFO - Copied model parameters to target network.
INFO - Step 120302, loss: 0.48215237259864807
########## Evaluation ##########
Timestep: 120302 Average reward is -0.0836
INFO - Step 120453, loss: 0.46841278672218323
########## Evaluation ##########
Timestep: 120453 Average reward is -0.0542
INFO - Step 120604, loss: 0.40136739611625674
########## Evaluation ##########
Timestep: 120604 Average reward is -0.0638
INFO - Step 120755, loss: 0.38844081759452824
########## Evaluation ##########
Timestep: 120755 Average reward is -0.0664
INFO - Step 120907, loss: 0.49618574976921084
######

  fig, ax = plt.subplots()


INFO - Step 121348, loss: 0.49623614549636846
########## Evaluation ##########
Timestep: 121348 Average reward is -0.0767
INFO - Step 121492, loss: 0.40772151947021484
########## Evaluation ##########
Timestep: 121492 Average reward is -0.0762
INFO - Step 121642, loss: 0.53061544895172123
########## Evaluation ##########
Timestep: 121642 Average reward is -0.0803
INFO - Step 121799, loss: 0.40505635738372856
########## Evaluation ##########
Timestep: 121799 Average reward is -0.0717
INFO - Step 121963, loss: 0.47124019265174866
########## Evaluation ##########
Timestep: 121963 Average reward is -0.0651
INFO - Step 122110, loss: 0.31276518106460575
########## Evaluation ##########
Timestep: 122110 Average reward is -0.0895
INFO - Step 122200, loss: 0.49699580669403076
INFO - Copied model parameters to target network.
INFO - Step 122269, loss: 0.47803923487663275
########## Evaluation ##########
Timestep: 122269 Average reward is -0.0618
INFO - Step 122429, loss: 0.40743955969810486
####

  fig, ax = plt.subplots()


INFO - Step 122883, loss: 0.36049929261207587
########## Evaluation ##########
Timestep: 122883 Average reward is -0.0654
INFO - Step 123041, loss: 0.44536340236663823
########## Evaluation ##########
Timestep: 123041 Average reward is -0.0529
INFO - Step 123194, loss: 0.41693204641342163
########## Evaluation ##########
Timestep: 123194 Average reward is -0.0631
INFO - Step 123200, loss: 0.48173597455024727
INFO - Copied model parameters to target network.
INFO - Step 123348, loss: 0.37543982267379765
########## Evaluation ##########
Timestep: 123348 Average reward is -0.0666
INFO - Step 123497, loss: 0.41385534405708313
########## Evaluation ##########
Timestep: 123497 Average reward is -0.0718
INFO - Step 123639, loss: 0.74339556694030766
########## Evaluation ##########
Timestep: 123639 Average reward is -0.0723
INFO - Step 123785, loss: 0.62064325809478767
########## Evaluation ##########
Timestep: 123785 Average reward is -0.0537
INFO - Step 123933, loss: 0.39228525757789617
####

  fig, ax = plt.subplots()


INFO - Step 124379, loss: 0.37269541621208197
########## Evaluation ##########
Timestep: 124379 Average reward is -0.0521
INFO - Step 124531, loss: 0.30411651730537415
########## Evaluation ##########
Timestep: 124531 Average reward is -0.0519
INFO - Step 124685, loss: 0.41286131739616394
########## Evaluation ##########
Timestep: 124685 Average reward is -0.0607
INFO - Step 124838, loss: 0.35917603969573975
########## Evaluation ##########
Timestep: 124838 Average reward is -0.0596
INFO - Step 125003, loss: 0.39105939865112305
########## Evaluation ##########
Timestep: 125003 Average reward is -0.0673
INFO - Step 125163, loss: 0.39197298884391785
########## Evaluation ##########
Timestep: 125163 Average reward is -0.0635
INFO - Step 125200, loss: 0.49490797519683843
INFO - Copied model parameters to target network.
INFO - Step 125305, loss: 0.75427180528640753
########## Evaluation ##########
Timestep: 125305 Average reward is -0.0577
INFO - Step 125467, loss: 0.41248780488967896
####

  fig, ax = plt.subplots()


INFO - Step 125927, loss: 0.35672232508659363
########## Evaluation ##########
Timestep: 125927 Average reward is -0.064
INFO - Step 126078, loss: 0.57894027233123787
########## Evaluation ##########
Timestep: 126078 Average reward is -0.0605
INFO - Step 126200, loss: 0.43413582444190984
INFO - Copied model parameters to target network.
INFO - Step 126235, loss: 0.44163188338279724
########## Evaluation ##########
Timestep: 126235 Average reward is -0.0636
INFO - Step 126381, loss: 0.60171800851821977
########## Evaluation ##########
Timestep: 126381 Average reward is -0.0697
INFO - Step 126542, loss: 0.64230430126190196
########## Evaluation ##########
Timestep: 126542 Average reward is -0.0756
INFO - Step 126691, loss: 0.46981376409530645
########## Evaluation ##########
Timestep: 126691 Average reward is -0.0537
INFO - Step 126847, loss: 0.42329850792884827
########## Evaluation ##########
Timestep: 126847 Average reward is -0.0578
INFO - Step 127006, loss: 0.58803629875183164
#####

  fig, ax = plt.subplots()


INFO - Step 127463, loss: 0.54954743385314946
########## Evaluation ##########
Timestep: 127463 Average reward is -0.0535
INFO - Step 127625, loss: 0.37443736195564276
########## Evaluation ##########
Timestep: 127625 Average reward is -0.0694
INFO - Step 127779, loss: 0.35603874921798706
########## Evaluation ##########
Timestep: 127779 Average reward is -0.0667
INFO - Step 127941, loss: 0.52613770961761475
########## Evaluation ##########
Timestep: 127941 Average reward is -0.0527
INFO - Step 128087, loss: 0.41663748025894165
########## Evaluation ##########
Timestep: 128087 Average reward is -0.0748
INFO - Step 128200, loss: 0.51612889766693124
INFO - Copied model parameters to target network.
INFO - Step 128244, loss: 0.49430549144744873
########## Evaluation ##########
Timestep: 128244 Average reward is -0.0668
INFO - Step 128400, loss: 0.43417629599571236
########## Evaluation ##########
Timestep: 128400 Average reward is -0.0675
INFO - Step 128541, loss: 0.49465176463127136
####

  fig, ax = plt.subplots()


INFO - Step 128985, loss: 0.70098745822906494
########## Evaluation ##########
Timestep: 128985 Average reward is -0.059
INFO - Step 129135, loss: 0.57981163263320926
########## Evaluation ##########
Timestep: 129135 Average reward is -0.078
INFO - Step 129200, loss: 0.54529684782028224
INFO - Copied model parameters to target network.
INFO - Step 129296, loss: 0.41626361012458865
########## Evaluation ##########
Timestep: 129296 Average reward is -0.0727
INFO - Step 129437, loss: 0.42341178655624396
########## Evaluation ##########
Timestep: 129437 Average reward is -0.0625
INFO - Step 129595, loss: 0.42344707250595096
########## Evaluation ##########
Timestep: 129595 Average reward is -0.0661
INFO - Step 129748, loss: 0.40579217672348025
########## Evaluation ##########
Timestep: 129748 Average reward is -0.0584
INFO - Step 129900, loss: 0.50367951393127447
########## Evaluation ##########
Timestep: 129900 Average reward is -0.0406
INFO - Step 130060, loss: 0.33220005035400393
######

  fig, ax = plt.subplots()


INFO - Step 130513, loss: 0.55558049678802496
########## Evaluation ##########
Timestep: 130513 Average reward is -0.0593
INFO - Step 130653, loss: 0.46155944466590885
########## Evaluation ##########
Timestep: 130653 Average reward is -0.075
INFO - Step 130805, loss: 0.46946910023689276
########## Evaluation ##########
Timestep: 130805 Average reward is -0.07
INFO - Step 130955, loss: 0.37082186341285706
########## Evaluation ##########
Timestep: 130955 Average reward is -0.0726
INFO - Step 131092, loss: 0.57830137014389045
########## Evaluation ##########
Timestep: 131092 Average reward is -0.0642
INFO - Step 131200, loss: 0.60514587163925176
INFO - Copied model parameters to target network.
INFO - Step 131231, loss: 0.70979297161102374
########## Evaluation ##########
Timestep: 131231 Average reward is -0.0481
INFO - Step 131377, loss: 0.53628492355346686
########## Evaluation ##########
Timestep: 131377 Average reward is -0.073
INFO - Step 131512, loss: 0.46082979440689087
########

  fig, ax = plt.subplots()


INFO - Step 131945, loss: 0.52327960729599826
########## Evaluation ##########
Timestep: 131945 Average reward is -0.0601
INFO - Step 132083, loss: 0.54829645156860355
########## Evaluation ##########
Timestep: 132083 Average reward is -0.0691
INFO - Step 132200, loss: 0.66216534376144414
INFO - Copied model parameters to target network.
INFO - Step 132224, loss: 0.50649785995483455
########## Evaluation ##########
Timestep: 132224 Average reward is -0.0615
INFO - Step 132369, loss: 0.50983351469039925
########## Evaluation ##########
Timestep: 132369 Average reward is -0.0852
INFO - Step 132511, loss: 0.59243482351303135
########## Evaluation ##########
Timestep: 132511 Average reward is -0.0507
INFO - Step 132653, loss: 0.63157236576080326
########## Evaluation ##########
Timestep: 132653 Average reward is -0.0672
INFO - Step 132800, loss: 0.59875822067260745
########## Evaluation ##########
Timestep: 132800 Average reward is -0.0567
INFO - Step 132942, loss: 0.46161460876464844
####

  fig, ax = plt.subplots()


INFO - Step 133375, loss: 0.49066561460494995
########## Evaluation ##########
Timestep: 133375 Average reward is -0.0735
INFO - Step 133513, loss: 0.66357785463333135
########## Evaluation ##########
Timestep: 133513 Average reward is -0.0825
INFO - Step 133651, loss: 0.76327556371688846
########## Evaluation ##########
Timestep: 133651 Average reward is -0.0598
INFO - Step 133806, loss: 0.47257822751998984
########## Evaluation ##########
Timestep: 133806 Average reward is -0.091
INFO - Step 133946, loss: 0.67900419235229496
########## Evaluation ##########
Timestep: 133946 Average reward is -0.0752
INFO - Step 134094, loss: 0.46965652704238894
########## Evaluation ##########
Timestep: 134094 Average reward is -0.0801
INFO - Step 134200, loss: 0.52505123615264897
INFO - Copied model parameters to target network.
INFO - Step 134248, loss: 0.49214285612106323
########## Evaluation ##########
Timestep: 134248 Average reward is -0.0746
INFO - Step 134386, loss: 0.57818692922592166
#####

  fig, ax = plt.subplots()


INFO - Step 134800, loss: 0.42568057775497437
########## Evaluation ##########
Timestep: 134800 Average reward is -0.0832
INFO - Step 134950, loss: 0.64061272144317637
########## Evaluation ##########
Timestep: 134950 Average reward is -0.0736
INFO - Step 135090, loss: 0.52753096818923957
########## Evaluation ##########
Timestep: 135090 Average reward is -0.0734
INFO - Step 135200, loss: 0.71932172775268557
INFO - Copied model parameters to target network.
INFO - Step 135226, loss: 0.55889117717742924
########## Evaluation ##########
Timestep: 135226 Average reward is -0.0827
INFO - Step 135364, loss: 0.34687554836273193
########## Evaluation ##########
Timestep: 135364 Average reward is -0.0698
INFO - Step 135506, loss: 0.30114924907684326
########## Evaluation ##########
Timestep: 135506 Average reward is -0.0645
INFO - Step 135654, loss: 0.66779845952987676
########## Evaluation ##########
Timestep: 135654 Average reward is -0.0657
INFO - Step 135798, loss: 0.40617409348487854
####

  fig, ax = plt.subplots()


INFO - Step 136200, loss: 0.37009328603744507
INFO - Copied model parameters to target network.
INFO - Step 136229, loss: 0.67435896396636966
########## Evaluation ##########
Timestep: 136229 Average reward is -0.0752
INFO - Step 136375, loss: 0.46438086032867434
########## Evaluation ##########
Timestep: 136375 Average reward is -0.0518
INFO - Step 136530, loss: 0.35936087369918823
########## Evaluation ##########
Timestep: 136530 Average reward is -0.0712
INFO - Step 136674, loss: 0.45948582887649536
########## Evaluation ##########
Timestep: 136674 Average reward is -0.06
INFO - Step 136823, loss: 0.55586063861846925
########## Evaluation ##########
Timestep: 136823 Average reward is -0.0679
INFO - Step 136979, loss: 0.70855218172073365
########## Evaluation ##########
Timestep: 136979 Average reward is -0.0853
INFO - Step 137124, loss: 0.44739627838134766
########## Evaluation ##########
Timestep: 137124 Average reward is -0.0747
INFO - Step 137200, loss: 0.39070099592208864
INFO -

  fig, ax = plt.subplots()


INFO - Step 137696, loss: 0.49011802673339844
########## Evaluation ##########
Timestep: 137696 Average reward is -0.0768
INFO - Step 137841, loss: 0.62921941280364994
########## Evaluation ##########
Timestep: 137841 Average reward is -0.083
INFO - Step 137978, loss: 0.40309393405914307
########## Evaluation ##########
Timestep: 137978 Average reward is -0.0757
INFO - Step 138118, loss: 0.40022641420364384
########## Evaluation ##########
Timestep: 138118 Average reward is -0.081
INFO - Step 138200, loss: 0.43313392996788025
INFO - Copied model parameters to target network.
INFO - Step 138266, loss: 0.56008732318878174
########## Evaluation ##########
Timestep: 138266 Average reward is -0.0651
INFO - Step 138411, loss: 0.57651472091674875
########## Evaluation ##########
Timestep: 138411 Average reward is -0.0638
INFO - Step 138557, loss: 0.56804370880126954
########## Evaluation ##########
Timestep: 138557 Average reward is -0.0504
INFO - Step 138710, loss: 0.34653538465499884
######

  fig, ax = plt.subplots()


INFO - Step 139139, loss: 0.46438378095626836
########## Evaluation ##########
Timestep: 139139 Average reward is -0.0751
INFO - Step 139200, loss: 0.66881471872329717
INFO - Copied model parameters to target network.
INFO - Step 139280, loss: 0.53832173347473145
########## Evaluation ##########
Timestep: 139280 Average reward is -0.0706
INFO - Step 139417, loss: 0.37678369879722595
########## Evaluation ##########
Timestep: 139417 Average reward is -0.067
INFO - Step 139561, loss: 0.44346761703491217
########## Evaluation ##########
Timestep: 139561 Average reward is -0.059
INFO - Step 139718, loss: 0.41902551054954533
########## Evaluation ##########
Timestep: 139718 Average reward is -0.063
INFO - Step 139869, loss: 0.45747959613800056
########## Evaluation ##########
Timestep: 139869 Average reward is -0.0743
INFO - Step 140013, loss: 0.67421406507492073
########## Evaluation ##########
Timestep: 140013 Average reward is -0.0696
INFO - Step 140163, loss: 0.48637968301773075
#######

  fig, ax = plt.subplots()


INFO - Step 140592, loss: 0.52200055122375497
########## Evaluation ##########
Timestep: 140592 Average reward is -0.0648
INFO - Step 140741, loss: 0.57507282495498666
########## Evaluation ##########
Timestep: 140741 Average reward is -0.0737
INFO - Step 140890, loss: 0.43761169910430918
########## Evaluation ##########
Timestep: 140890 Average reward is -0.0704
INFO - Step 141035, loss: 0.55071508884429936
########## Evaluation ##########
Timestep: 141035 Average reward is -0.074
INFO - Step 141189, loss: 0.53533589839935347
########## Evaluation ##########
Timestep: 141189 Average reward is -0.0681
INFO - Step 141200, loss: 0.30930083990097046
INFO - Copied model parameters to target network.
INFO - Step 141336, loss: 0.65378856658935556
########## Evaluation ##########
Timestep: 141336 Average reward is -0.0535
INFO - Step 141483, loss: 0.34521561861038213
########## Evaluation ##########
Timestep: 141483 Average reward is -0.0714
INFO - Step 141637, loss: 0.51717650890350343
#####

  fig, ax = plt.subplots()


INFO - Step 142097, loss: 0.47329145669937134
########## Evaluation ##########
Timestep: 142097 Average reward is -0.0629
INFO - Step 142200, loss: 0.52395802736282353
INFO - Copied model parameters to target network.
INFO - Step 142240, loss: 0.49721390008926396
########## Evaluation ##########
Timestep: 142240 Average reward is -0.0819
INFO - Step 142388, loss: 0.56284773349761966
########## Evaluation ##########
Timestep: 142388 Average reward is -0.0481
INFO - Step 142535, loss: 0.46432417631149293
########## Evaluation ##########
Timestep: 142535 Average reward is -0.0623
INFO - Step 142681, loss: 0.50723892450332646
########## Evaluation ##########
Timestep: 142681 Average reward is -0.062
INFO - Step 142831, loss: 0.76171475648884707
########## Evaluation ##########
Timestep: 142831 Average reward is -0.0643
INFO - Step 142975, loss: 0.46465560793876656
########## Evaluation ##########
Timestep: 142975 Average reward is -0.0579
INFO - Step 143127, loss: 0.47317129373550415
#####

  fig, ax = plt.subplots()


INFO - Step 143564, loss: 0.44408553838729867
########## Evaluation ##########
Timestep: 143564 Average reward is -0.0587
INFO - Step 143714, loss: 0.65062272548675545
########## Evaluation ##########
Timestep: 143714 Average reward is -0.0601
INFO - Step 143864, loss: 0.45213544368743896
########## Evaluation ##########
Timestep: 143864 Average reward is -0.0641
INFO - Step 144010, loss: 0.57094061374664315
########## Evaluation ##########
Timestep: 144010 Average reward is -0.0657
INFO - Step 144159, loss: 0.40791893005371094
########## Evaluation ##########
Timestep: 144159 Average reward is -0.0439
INFO - Step 144200, loss: 0.57232892513275156
INFO - Copied model parameters to target network.
INFO - Step 144307, loss: 0.64516651630401615
########## Evaluation ##########
Timestep: 144307 Average reward is -0.0513
INFO - Step 144447, loss: 0.40230810642242437
########## Evaluation ##########
Timestep: 144447 Average reward is -0.0541
INFO - Step 144598, loss: 0.65745484828948975
####

  fig, ax = plt.subplots()


INFO - Step 145041, loss: 0.46238833665847785
########## Evaluation ##########
Timestep: 145041 Average reward is -0.0539
INFO - Step 145200, loss: 0.49958604574203497
INFO - Copied model parameters to target network.
INFO - Step 145202, loss: 0.48040345311164856
########## Evaluation ##########
Timestep: 145202 Average reward is -0.0598
INFO - Step 145340, loss: 0.66955089569091874
########## Evaluation ##########
Timestep: 145340 Average reward is -0.0692
INFO - Step 145482, loss: 0.52363812923431467
########## Evaluation ##########
Timestep: 145482 Average reward is -0.0566
INFO - Step 145623, loss: 0.70788139104843144
########## Evaluation ##########
Timestep: 145623 Average reward is -0.0722
INFO - Step 145765, loss: 0.78978329896926887
########## Evaluation ##########
Timestep: 145765 Average reward is -0.0379
INFO - Step 145912, loss: 0.53200811147689824
########## Evaluation ##########
Timestep: 145912 Average reward is -0.05
INFO - Step 146064, loss: 0.71295547485351564
######

  fig, ax = plt.subplots()


INFO - Step 146506, loss: 0.51431256532669074
########## Evaluation ##########
Timestep: 146506 Average reward is -0.0707
INFO - Step 146664, loss: 0.36141109466552734
########## Evaluation ##########
Timestep: 146664 Average reward is -0.0717
INFO - Step 146811, loss: 0.56521242856979375
########## Evaluation ##########
Timestep: 146811 Average reward is -0.0556
INFO - Step 146960, loss: 0.54499763250350956
########## Evaluation ##########
Timestep: 146960 Average reward is -0.0654
INFO - Step 147109, loss: 0.53000694513320925
########## Evaluation ##########
Timestep: 147109 Average reward is -0.0572
INFO - Step 147200, loss: 0.39128875732421875
INFO - Copied model parameters to target network.
INFO - Step 147251, loss: 0.68242931365966877
########## Evaluation ##########
Timestep: 147251 Average reward is -0.0725
INFO - Step 147400, loss: 0.57760906219482427
########## Evaluation ##########
Timestep: 147400 Average reward is -0.0708
INFO - Step 147543, loss: 0.42184376716613775
####

  fig, ax = plt.subplots()


INFO - Step 147979, loss: 0.47035878896713257
########## Evaluation ##########
Timestep: 147979 Average reward is -0.0692
INFO - Step 148140, loss: 0.42808228731155396
########## Evaluation ##########
Timestep: 148140 Average reward is -0.0665
INFO - Step 148200, loss: 0.54018217325210577
INFO - Copied model parameters to target network.
INFO - Step 148295, loss: 0.53777819871902476
########## Evaluation ##########
Timestep: 148295 Average reward is -0.0539
INFO - Step 148444, loss: 0.27758455276489265
########## Evaluation ##########
Timestep: 148444 Average reward is -0.0673
INFO - Step 148589, loss: 0.49546515941619873
########## Evaluation ##########
Timestep: 148589 Average reward is -0.0632
INFO - Step 148725, loss: 0.42980456352233887
########## Evaluation ##########
Timestep: 148725 Average reward is -0.0746
INFO - Step 148866, loss: 0.46531027555465754
########## Evaluation ##########
Timestep: 148866 Average reward is -0.0481
INFO - Step 149013, loss: 0.55599606037139895
####

  fig, ax = plt.subplots()


INFO - Step 149459, loss: 0.53756630420684815
########## Evaluation ##########
Timestep: 149459 Average reward is -0.0513
INFO - Step 149607, loss: 0.58255898952484134
########## Evaluation ##########
Timestep: 149607 Average reward is -0.0534
INFO - Step 149752, loss: 0.50732445716857915
########## Evaluation ##########
Timestep: 149752 Average reward is -0.0625
INFO - Step 149894, loss: 0.58972597122192384
########## Evaluation ##########
Timestep: 149894 Average reward is -0.0733
INFO - Step 150039, loss: 0.59409987926483155
########## Evaluation ##########
Timestep: 150039 Average reward is -0.0545
INFO - Step 150181, loss: 0.65509796142578126
########## Evaluation ##########
Timestep: 150181 Average reward is -0.0723
INFO - Step 150200, loss: 0.63263845443725593
INFO - Copied model parameters to target network.
INFO - Step 150331, loss: 0.61679029464721683
########## Evaluation ##########
Timestep: 150331 Average reward is -0.0526
INFO - Step 150475, loss: 0.43826976418495185
####

  fig, ax = plt.subplots()


INFO - Step 150908, loss: 0.76791036128997834
########## Evaluation ##########
Timestep: 150908 Average reward is -0.0661
INFO - Step 151058, loss: 0.35899001359939575
########## Evaluation ##########
Timestep: 151058 Average reward is -0.0576
INFO - Step 151200, loss: 0.57595258951187137
INFO - Copied model parameters to target network.
INFO - Step 151207, loss: 0.5160151124000549
########## Evaluation ##########
Timestep: 151207 Average reward is -0.0686
INFO - Step 151356, loss: 0.45569089055061346
########## Evaluation ##########
Timestep: 151356 Average reward is -0.0529
INFO - Step 151498, loss: 0.63509863615036013
########## Evaluation ##########
Timestep: 151498 Average reward is -0.07
INFO - Step 151638, loss: 0.36997717618942263
########## Evaluation ##########
Timestep: 151638 Average reward is -0.0601
INFO - Step 151796, loss: 0.37494963407516483
########## Evaluation ##########
Timestep: 151796 Average reward is -0.0815
INFO - Step 151941, loss: 0.48037457466125493
#######

  fig, ax = plt.subplots()


INFO - Step 152386, loss: 0.56518197059631355
########## Evaluation ##########
Timestep: 152386 Average reward is -0.0526
INFO - Step 152540, loss: 0.31253355741500854
########## Evaluation ##########
Timestep: 152540 Average reward is -0.0514
INFO - Step 152688, loss: 0.48731714487075806
########## Evaluation ##########
Timestep: 152688 Average reward is -0.0509
INFO - Step 152838, loss: 0.39384818077087474
########## Evaluation ##########
Timestep: 152838 Average reward is -0.0813
INFO - Step 152979, loss: 0.47262632846832275
########## Evaluation ##########
Timestep: 152979 Average reward is -0.0773
INFO - Step 153125, loss: 0.65897142887115483
########## Evaluation ##########
Timestep: 153125 Average reward is -0.0591
INFO - Step 153200, loss: 0.58718991279602057
INFO - Copied model parameters to target network.
INFO - Step 153275, loss: 0.39759731292724617
########## Evaluation ##########
Timestep: 153275 Average reward is -0.063
INFO - Step 153422, loss: 0.49959355592727664
#####

  fig, ax = plt.subplots()


INFO - Step 153871, loss: 0.50641638040542684
########## Evaluation ##########
Timestep: 153871 Average reward is -0.0646
INFO - Step 154025, loss: 0.67287647724151613
########## Evaluation ##########
Timestep: 154025 Average reward is -0.0515
INFO - Step 154172, loss: 0.41248184442520143
########## Evaluation ##########
Timestep: 154172 Average reward is -0.0671
INFO - Step 154200, loss: 0.52751290798187267
INFO - Copied model parameters to target network.
INFO - Step 154325, loss: 0.56167650222778327
########## Evaluation ##########
Timestep: 154325 Average reward is -0.0531
INFO - Step 154475, loss: 0.45680829882621765
########## Evaluation ##########
Timestep: 154475 Average reward is -0.0672
INFO - Step 154625, loss: 0.52573835849761964
########## Evaluation ##########
Timestep: 154625 Average reward is -0.0519
INFO - Step 154775, loss: 0.45115858316421514
########## Evaluation ##########
Timestep: 154775 Average reward is -0.072
INFO - Step 154917, loss: 0.53863924741745657
#####

  fig, ax = plt.subplots()


INFO - Step 155353, loss: 0.36417400836944585
########## Evaluation ##########
Timestep: 155353 Average reward is -0.0802
INFO - Step 155493, loss: 0.46801236271858215
########## Evaluation ##########
Timestep: 155493 Average reward is -0.0676
INFO - Step 155641, loss: 0.53305542469024666
########## Evaluation ##########
Timestep: 155641 Average reward is -0.065
INFO - Step 155781, loss: 0.40838611125946045
########## Evaluation ##########
Timestep: 155781 Average reward is -0.055
INFO - Step 155918, loss: 0.49591082334518436
########## Evaluation ##########
Timestep: 155918 Average reward is -0.0691
INFO - Step 156056, loss: 0.46523904800415045
########## Evaluation ##########
Timestep: 156056 Average reward is -0.0381
INFO - Step 156200, loss: 0.64042896032333375
INFO - Copied model parameters to target network.
INFO - Step 156203, loss: 0.43583959341049194
########## Evaluation ##########
Timestep: 156203 Average reward is -0.0631
INFO - Step 156364, loss: 0.43105635046958923
######

  fig, ax = plt.subplots()


INFO - Step 156813, loss: 0.44258779287338257
########## Evaluation ##########
Timestep: 156813 Average reward is -0.062
INFO - Step 156964, loss: 0.65825819969177254
########## Evaluation ##########
Timestep: 156964 Average reward is -0.0722
INFO - Step 157118, loss: 0.50768017768859863
########## Evaluation ##########
Timestep: 157118 Average reward is -0.0503
INFO - Step 157200, loss: 0.41816478967666626
INFO - Copied model parameters to target network.
INFO - Step 157269, loss: 0.66216385364532474
########## Evaluation ##########
Timestep: 157269 Average reward is -0.0533
INFO - Step 157425, loss: 0.59060752391815195
########## Evaluation ##########
Timestep: 157425 Average reward is -0.0524
INFO - Step 157575, loss: 0.42698866128921513
########## Evaluation ##########
Timestep: 157575 Average reward is -0.0657
INFO - Step 157725, loss: 0.53187173604965217
########## Evaluation ##########
Timestep: 157725 Average reward is -0.0646
INFO - Step 157872, loss: 0.39910989999771126
#####

  fig, ax = plt.subplots()



INFO - Copied model parameters to target network.
INFO - Step 158297, loss: 0.49646681547164917
########## Evaluation ##########
Timestep: 158297 Average reward is -0.0434
INFO - Step 158435, loss: 0.48127609491348267
########## Evaluation ##########
Timestep: 158435 Average reward is -0.0444
INFO - Step 158586, loss: 0.50642621517181423
########## Evaluation ##########
Timestep: 158586 Average reward is -0.0733
INFO - Step 158745, loss: 0.57888054847717293
########## Evaluation ##########
Timestep: 158745 Average reward is -0.0613
INFO - Step 158899, loss: 0.55024468898773193
########## Evaluation ##########
Timestep: 158899 Average reward is -0.054
INFO - Step 159047, loss: 0.53936815261840825
########## Evaluation ##########
Timestep: 159047 Average reward is -0.0644
INFO - Step 159198, loss: 0.53053319454193126
########## Evaluation ##########
Timestep: 159198 Average reward is -0.0667
INFO - Step 159200, loss: 0.38016289472579956
INFO - Copied model parameters to target network.


  fig, ax = plt.subplots()


INFO - Step 159758, loss: 0.34375989437103276
########## Evaluation ##########
Timestep: 159758 Average reward is -0.0715
INFO - Step 159911, loss: 0.45554870367050174
########## Evaluation ##########
Timestep: 159911 Average reward is -0.0493
INFO - Step 160048, loss: 0.50078207254409794
########## Evaluation ##########
Timestep: 160048 Average reward is -0.0639
INFO - Step 160196, loss: 0.74214351177215585
########## Evaluation ##########
Timestep: 160196 Average reward is -0.0699
INFO - Step 160200, loss: 0.42994496226310736
INFO - Copied model parameters to target network.
INFO - Step 160337, loss: 0.43091046810150146
########## Evaluation ##########
Timestep: 160337 Average reward is -0.0748
INFO - Step 160493, loss: 0.59643054008483896
########## Evaluation ##########
Timestep: 160493 Average reward is -0.067
INFO - Step 160626, loss: 0.55259305238723754
########## Evaluation ##########
Timestep: 160626 Average reward is -0.0861
INFO - Step 160772, loss: 0.33512321114540164
#####

  fig, ax = plt.subplots()


INFO - Step 161189, loss: 0.55613952875137337
########## Evaluation ##########
Timestep: 161189 Average reward is -0.0712
INFO - Step 161200, loss: 0.45534420013427734
INFO - Copied model parameters to target network.
INFO - Step 161327, loss: 0.56281697750091553
########## Evaluation ##########
Timestep: 161327 Average reward is -0.0828
INFO - Step 161465, loss: 0.46022015810012824
########## Evaluation ##########
Timestep: 161465 Average reward is -0.0547
INFO - Step 161613, loss: 0.39802980422973633
########## Evaluation ##########
Timestep: 161613 Average reward is -0.0811
INFO - Step 161758, loss: 0.58430111408233647
########## Evaluation ##########
Timestep: 161758 Average reward is -0.0794
INFO - Step 161896, loss: 0.58853983879089365
########## Evaluation ##########
Timestep: 161896 Average reward is -0.0769
INFO - Step 162048, loss: 0.49803191423416143
########## Evaluation ##########
Timestep: 162048 Average reward is -0.0715
INFO - Step 162191, loss: 0.54577028751373294
####

  fig, ax = plt.subplots()


INFO - Step 162614, loss: 0.56357538700103766
########## Evaluation ##########
Timestep: 162614 Average reward is -0.0752
INFO - Step 162780, loss: 0.34978640079498294
########## Evaluation ##########
Timestep: 162780 Average reward is -0.0651
INFO - Step 162931, loss: 0.41322541236877445
########## Evaluation ##########
Timestep: 162931 Average reward is -0.0609
INFO - Step 163078, loss: 0.39609330892562866
########## Evaluation ##########
Timestep: 163078 Average reward is -0.0441
INFO - Step 163200, loss: 0.59611451625823975
INFO - Copied model parameters to target network.
INFO - Step 163209, loss: 0.47621375322341924
########## Evaluation ##########
Timestep: 163209 Average reward is -0.0592
INFO - Step 163351, loss: 0.53280240297317553
########## Evaluation ##########
Timestep: 163351 Average reward is -0.0425
INFO - Step 163493, loss: 0.66979247331619265
########## Evaluation ##########
Timestep: 163493 Average reward is -0.0541
INFO - Step 163627, loss: 0.50692820549011236
####

  fig, ax = plt.subplots()


INFO - Step 164048, loss: 0.33968612551689155
########## Evaluation ##########
Timestep: 164048 Average reward is -0.0744
INFO - Step 164188, loss: 0.46862599253654483
########## Evaluation ##########
Timestep: 164188 Average reward is -0.0751
INFO - Step 164200, loss: 0.42529553174972534
INFO - Copied model parameters to target network.
INFO - Step 164315, loss: 0.36423835158348083
########## Evaluation ##########
Timestep: 164315 Average reward is -0.0802
INFO - Step 164460, loss: 0.79109001159667976
########## Evaluation ##########
Timestep: 164460 Average reward is -0.0765
INFO - Step 164606, loss: 0.58646088838577274
########## Evaluation ##########
Timestep: 164606 Average reward is -0.0738
INFO - Step 164749, loss: 0.53934276103973397
########## Evaluation ##########
Timestep: 164749 Average reward is -0.0786
INFO - Step 164880, loss: 0.37657818198204046
########## Evaluation ##########
Timestep: 164880 Average reward is -0.0781
INFO - Step 165020, loss: 0.49768841266632085
####

  fig, ax = plt.subplots()


INFO - Step 165437, loss: 0.61436277627944956
########## Evaluation ##########
Timestep: 165437 Average reward is -0.0741
INFO - Step 165584, loss: 0.39011836051940927
########## Evaluation ##########
Timestep: 165584 Average reward is -0.0608
INFO - Step 165732, loss: 0.46268922090530396
########## Evaluation ##########
Timestep: 165732 Average reward is -0.06
INFO - Step 165878, loss: 0.51510888338088993
########## Evaluation ##########
Timestep: 165878 Average reward is -0.0547
INFO - Step 166019, loss: 0.51325994729995735
########## Evaluation ##########
Timestep: 166019 Average reward is -0.0544
INFO - Step 166160, loss: 0.47273159027099616
########## Evaluation ##########
Timestep: 166160 Average reward is -0.0567
INFO - Step 166200, loss: 0.66529196500778297
INFO - Copied model parameters to target network.
INFO - Step 166296, loss: 0.31546336412429815
########## Evaluation ##########
Timestep: 166296 Average reward is -0.0644
INFO - Step 166434, loss: 0.50022661685943627
######

  fig, ax = plt.subplots()


INFO - Step 166873, loss: 0.48706495761871346
########## Evaluation ##########
Timestep: 166873 Average reward is -0.0878
INFO - Step 167020, loss: 0.53105378150939943
########## Evaluation ##########
Timestep: 167020 Average reward is -0.066
INFO - Step 167170, loss: 0.62259149551391683
########## Evaluation ##########
Timestep: 167170 Average reward is -0.0533
INFO - Step 167200, loss: 0.44086587429046634
INFO - Copied model parameters to target network.
INFO - Step 167318, loss: 0.62817144393920945
########## Evaluation ##########
Timestep: 167318 Average reward is -0.0667
INFO - Step 167460, loss: 0.49573254585266113
########## Evaluation ##########
Timestep: 167460 Average reward is -0.067
INFO - Step 167606, loss: 0.57588237524032594
########## Evaluation ##########
Timestep: 167606 Average reward is -0.0725
INFO - Step 167758, loss: 0.51696264743804933
########## Evaluation ##########
Timestep: 167758 Average reward is -0.0652
INFO - Step 167916, loss: 0.47723507881164554
######

  fig, ax = plt.subplots()


INFO - Step 168344, loss: 0.50873315334320076
########## Evaluation ##########
Timestep: 168344 Average reward is -0.0718
INFO - Step 168487, loss: 0.56937211751937877
########## Evaluation ##########
Timestep: 168487 Average reward is -0.0679
INFO - Step 168629, loss: 0.64025866985321046
########## Evaluation ##########
Timestep: 168629 Average reward is -0.0662
INFO - Step 168779, loss: 0.48739618062973025
########## Evaluation ##########
Timestep: 168779 Average reward is -0.0575
INFO - Step 168918, loss: 0.57292604446411134
########## Evaluation ##########
Timestep: 168918 Average reward is -0.0612
INFO - Step 169061, loss: 0.68135029077529916
########## Evaluation ##########
Timestep: 169061 Average reward is -0.0688
INFO - Step 169200, loss: 0.46977311372756964
INFO - Copied model parameters to target network.
INFO - Step 169212, loss: 0.72007596492767336
########## Evaluation ##########
Timestep: 169212 Average reward is -0.0556
INFO - Step 169350, loss: 0.49523237347602844
####

  fig, ax = plt.subplots()


INFO - Step 169773, loss: 0.58791410923004155
########## Evaluation ##########
Timestep: 169773 Average reward is -0.0695
INFO - Step 169924, loss: 0.60281002521514897
########## Evaluation ##########
Timestep: 169924 Average reward is -0.0595
INFO - Step 170069, loss: 0.32717663049697876
########## Evaluation ##########
Timestep: 170069 Average reward is -0.0679
INFO - Step 170200, loss: 0.57165253162384035
INFO - Copied model parameters to target network.
INFO - Step 170224, loss: 0.61333215236663825
########## Evaluation ##########
Timestep: 170224 Average reward is -0.0603
INFO - Step 170371, loss: 0.49332919716835024
########## Evaluation ##########
Timestep: 170371 Average reward is -0.0657
INFO - Step 170515, loss: 0.62165498733520517
########## Evaluation ##########
Timestep: 170515 Average reward is -0.0496
INFO - Step 170667, loss: 0.45942676067352295
########## Evaluation ##########
Timestep: 170667 Average reward is -0.0673
INFO - Step 170807, loss: 0.50836253166198734
####

  fig, ax = plt.subplots()


INFO - Step 171200, loss: 0.45027458667755127
INFO - Copied model parameters to target network.
INFO - Step 171255, loss: 0.59791648387908944
########## Evaluation ##########
Timestep: 171255 Average reward is -0.0545
INFO - Step 171394, loss: 0.56147778034210225
########## Evaluation ##########
Timestep: 171394 Average reward is -0.0539
INFO - Step 171561, loss: 0.57205229997634896
########## Evaluation ##########
Timestep: 171561 Average reward is -0.0562
INFO - Step 171714, loss: 0.46099704504013067
########## Evaluation ##########
Timestep: 171714 Average reward is -0.0644
INFO - Step 171856, loss: 0.39808756113052375
########## Evaluation ##########
Timestep: 171856 Average reward is -0.0651
INFO - Step 172004, loss: 0.45818719267845154
########## Evaluation ##########
Timestep: 172004 Average reward is -0.0682
INFO - Step 172155, loss: 0.54305773973464977
########## Evaluation ##########
Timestep: 172155 Average reward is -0.0489
INFO - Step 172200, loss: 0.61686503887176515
INFO

  fig, ax = plt.subplots()


INFO - Step 172748, loss: 0.65860354900360116
########## Evaluation ##########
Timestep: 172748 Average reward is -0.0422
INFO - Step 172889, loss: 0.50607895851135256
########## Evaluation ##########
Timestep: 172889 Average reward is -0.0654
INFO - Step 173033, loss: 0.66015458106994637
########## Evaluation ##########
Timestep: 173033 Average reward is -0.0538
INFO - Step 173181, loss: 0.50200188159942637
########## Evaluation ##########
Timestep: 173181 Average reward is -0.0634
INFO - Step 173200, loss: 0.70843034982681277
INFO - Copied model parameters to target network.
INFO - Step 173342, loss: 0.48517018556594857
########## Evaluation ##########
Timestep: 173342 Average reward is -0.0734
INFO - Step 173481, loss: 0.44734388589859016
########## Evaluation ##########
Timestep: 173481 Average reward is -0.064
INFO - Step 173633, loss: 0.63835722208023077
########## Evaluation ##########
Timestep: 173633 Average reward is -0.0455
INFO - Step 173787, loss: 0.68522012233734135
#####

  fig, ax = plt.subplots()


INFO - Step 174200, loss: 0.66972720623016364
INFO - Copied model parameters to target network.
INFO - Step 174229, loss: 0.51798760890960696
########## Evaluation ##########
Timestep: 174229 Average reward is -0.0738
INFO - Step 174363, loss: 0.44291496276855475
########## Evaluation ##########
Timestep: 174363 Average reward is -0.0556
INFO - Step 174499, loss: 0.50193405151367194
########## Evaluation ##########
Timestep: 174499 Average reward is -0.0487
INFO - Step 174660, loss: 0.45904329419136055
########## Evaluation ##########
Timestep: 174660 Average reward is -0.0711
INFO - Step 174803, loss: 0.56144070625305183
########## Evaluation ##########
Timestep: 174803 Average reward is -0.0518
INFO - Step 174944, loss: 0.33515679836273193
########## Evaluation ##########
Timestep: 174944 Average reward is -0.0544
INFO - Step 175103, loss: 0.53191840648651124
########## Evaluation ##########
Timestep: 175103 Average reward is -0.0717
INFO - Step 175200, loss: 0.45725256204605126
INFO

  fig, ax = plt.subplots()


INFO - Step 175743, loss: 0.53092157840728766
########## Evaluation ##########
Timestep: 175743 Average reward is -0.0698
INFO - Step 175909, loss: 0.64929270744323734
########## Evaluation ##########
Timestep: 175909 Average reward is -0.072
INFO - Step 176062, loss: 0.63719242811203685
########## Evaluation ##########
Timestep: 176062 Average reward is -0.0581
INFO - Step 176200, loss: 0.42528009414672853
INFO - Copied model parameters to target network.
INFO - Step 176209, loss: 0.60005575418472295
########## Evaluation ##########
Timestep: 176209 Average reward is -0.0667
INFO - Step 176361, loss: 0.55717909336090095
########## Evaluation ##########
Timestep: 176361 Average reward is -0.08
INFO - Step 176517, loss: 0.59033942222595213
########## Evaluation ##########
Timestep: 176517 Average reward is -0.0469
INFO - Step 176660, loss: 0.48882728815078735
########## Evaluation ##########
Timestep: 176660 Average reward is -0.0467
INFO - Step 176813, loss: 0.44442558288574223
#######

  fig, ax = plt.subplots()


INFO - Step 177200, loss: 0.59789967536926277
INFO - Copied model parameters to target network.
INFO - Step 177248, loss: 0.48339557647705086
########## Evaluation ##########
Timestep: 177248 Average reward is -0.0361
INFO - Step 177394, loss: 0.49106502532958984
########## Evaluation ##########
Timestep: 177394 Average reward is -0.063
INFO - Step 177546, loss: 0.43368002772331244
########## Evaluation ##########
Timestep: 177546 Average reward is -0.068
INFO - Step 177695, loss: 0.48388767242431646
########## Evaluation ##########
Timestep: 177695 Average reward is -0.0649
INFO - Step 177846, loss: 0.76933717727661133
########## Evaluation ##########
Timestep: 177846 Average reward is -0.0646
INFO - Step 178004, loss: 0.37690308690071106
########## Evaluation ##########
Timestep: 178004 Average reward is -0.0846
INFO - Step 178144, loss: 0.61156892776489266
########## Evaluation ##########
Timestep: 178144 Average reward is -0.0546
INFO - Step 178200, loss: 0.50049167871475223
INFO -

  fig, ax = plt.subplots()


INFO - Step 178731, loss: 0.49001553654670715
########## Evaluation ##########
Timestep: 178731 Average reward is -0.0664
INFO - Step 178882, loss: 0.49557653069496155
########## Evaluation ##########
Timestep: 178882 Average reward is -0.0549
INFO - Step 179023, loss: 0.52917736768722536
########## Evaluation ##########
Timestep: 179023 Average reward is -0.0543
INFO - Step 179181, loss: 0.59347170591354376
########## Evaluation ##########
Timestep: 179181 Average reward is -0.0619
INFO - Step 179200, loss: 0.51546663045883183
INFO - Copied model parameters to target network.
INFO - Step 179337, loss: 0.54240787029266366
########## Evaluation ##########
Timestep: 179337 Average reward is -0.0786
INFO - Step 179488, loss: 0.56118428707122866
########## Evaluation ##########
Timestep: 179488 Average reward is -0.0579
INFO - Step 179645, loss: 0.45941516757011414
########## Evaluation ##########
Timestep: 179645 Average reward is -0.072
INFO - Step 179798, loss: 0.54810333251953126
#####

  fig, ax = plt.subplots()


INFO - Step 180200, loss: 0.45418509840965276
INFO - Copied model parameters to target network.
INFO - Step 180231, loss: 0.41210550069808964
########## Evaluation ##########
Timestep: 180231 Average reward is -0.0533
INFO - Step 180373, loss: 0.52805447578430184
########## Evaluation ##########
Timestep: 180373 Average reward is -0.07
INFO - Step 180533, loss: 0.38503205776214694
########## Evaluation ##########
Timestep: 180533 Average reward is -0.0577
INFO - Step 180679, loss: 0.46607613563537617
########## Evaluation ##########
Timestep: 180679 Average reward is -0.0579
INFO - Step 180831, loss: 0.55239975452423173
########## Evaluation ##########
Timestep: 180831 Average reward is -0.05
INFO - Step 180988, loss: 0.70584225654602056
########## Evaluation ##########
Timestep: 180988 Average reward is -0.0559
INFO - Step 181132, loss: 0.32815361022949224
########## Evaluation ##########
Timestep: 181132 Average reward is -0.0714
INFO - Step 181200, loss: 0.54714637994766244
INFO - C

  fig, ax = plt.subplots()


INFO - Step 181748, loss: 0.51729387044906627
########## Evaluation ##########
Timestep: 181748 Average reward is -0.043
INFO - Step 181899, loss: 0.53722494840621954
########## Evaluation ##########
Timestep: 181899 Average reward is -0.0538
INFO - Step 182052, loss: 0.50098025798797616
########## Evaluation ##########
Timestep: 182052 Average reward is -0.0698
INFO - Step 182200, loss: 0.86061823368072515
INFO - Copied model parameters to target network.
INFO - Step 182202, loss: 0.42424970865249634
########## Evaluation ##########
Timestep: 182202 Average reward is -0.0667
INFO - Step 182355, loss: 0.63017451763153084
########## Evaluation ##########
Timestep: 182355 Average reward is -0.0652
INFO - Step 182504, loss: 0.43576976656913765
########## Evaluation ##########
Timestep: 182504 Average reward is -0.0688
INFO - Step 182651, loss: 0.46249148249626166
########## Evaluation ##########
Timestep: 182651 Average reward is -0.0543
INFO - Step 182803, loss: 0.66590899229049685
#####

  fig, ax = plt.subplots()


INFO - Step 183200, loss: 0.37074366211891174
INFO - Copied model parameters to target network.
INFO - Step 183268, loss: 0.38627013564109883
########## Evaluation ##########
Timestep: 183268 Average reward is -0.0386
INFO - Step 183415, loss: 0.61222046613693246
########## Evaluation ##########
Timestep: 183415 Average reward is -0.0649
INFO - Step 183577, loss: 0.39618161320686344
########## Evaluation ##########
Timestep: 183577 Average reward is -0.0832
INFO - Step 183722, loss: 0.62387490272521974
########## Evaluation ##########
Timestep: 183722 Average reward is -0.0663
INFO - Step 183880, loss: 0.58511567115783696
########## Evaluation ##########
Timestep: 183880 Average reward is -0.0575
INFO - Step 184036, loss: 0.46552929282188416
########## Evaluation ##########
Timestep: 184036 Average reward is -0.0724
INFO - Step 184191, loss: 0.53654307126998993
########## Evaluation ##########
Timestep: 184191 Average reward is -0.0696
INFO - Step 184200, loss: 0.64246052503585827
INFO

  fig, ax = plt.subplots()


INFO - Step 184823, loss: 0.53520768880844127
########## Evaluation ##########
Timestep: 184823 Average reward is -0.0773
INFO - Step 184980, loss: 0.59609448909759525
########## Evaluation ##########
Timestep: 184980 Average reward is -0.0523
INFO - Step 185123, loss: 0.58396905660629274
########## Evaluation ##########
Timestep: 185123 Average reward is -0.0613
INFO - Step 185200, loss: 0.50120228528976446
INFO - Copied model parameters to target network.
INFO - Step 185272, loss: 0.37547469139099123
########## Evaluation ##########
Timestep: 185272 Average reward is -0.0577
INFO - Step 185418, loss: 0.52576780319213875
########## Evaluation ##########
Timestep: 185418 Average reward is -0.0573
INFO - Step 185558, loss: 0.47883880138397217
########## Evaluation ##########
Timestep: 185558 Average reward is -0.0643
INFO - Step 185709, loss: 0.38338306546211246
########## Evaluation ##########
Timestep: 185709 Average reward is -0.0533
INFO - Step 185852, loss: 0.70518946647644047
####

  fig, ax = plt.subplots()


INFO - Step 186200, loss: 0.47605800628662117
INFO - Copied model parameters to target network.
INFO - Step 186302, loss: 0.68181049823760993
########## Evaluation ##########
Timestep: 186302 Average reward is -0.0395
INFO - Step 186439, loss: 0.33099022507667543
########## Evaluation ##########
Timestep: 186439 Average reward is -0.0648
INFO - Step 186591, loss: 0.45108985900878906
########## Evaluation ##########
Timestep: 186591 Average reward is -0.0568
INFO - Step 186741, loss: 0.62957090139389047
########## Evaluation ##########
Timestep: 186741 Average reward is -0.0607
INFO - Step 186900, loss: 0.21569329500198364
########## Evaluation ##########
Timestep: 186900 Average reward is -0.0434
INFO - Step 187052, loss: 0.79608887434005743
########## Evaluation ##########
Timestep: 187052 Average reward is -0.0674
INFO - Step 187194, loss: 0.67291319370269785
########## Evaluation ##########
Timestep: 187194 Average reward is -0.0513
INFO - Step 187200, loss: 0.39809906482696533
INFO

  fig, ax = plt.subplots()


INFO - Step 187766, loss: 0.36959999799728394
########## Evaluation ##########
Timestep: 187766 Average reward is -0.0733
INFO - Step 187914, loss: 0.36701589822769165
########## Evaluation ##########
Timestep: 187914 Average reward is -0.0561
INFO - Step 188060, loss: 0.54024040699005133
########## Evaluation ##########
Timestep: 188060 Average reward is -0.0502
INFO - Step 188200, loss: 0.69085001945495633
INFO - Copied model parameters to target network.
INFO - Step 188212, loss: 0.42181885242462167
########## Evaluation ##########
Timestep: 188212 Average reward is -0.0608
INFO - Step 188360, loss: 0.44011032581329346
########## Evaluation ##########
Timestep: 188360 Average reward is -0.0557
INFO - Step 188499, loss: 0.32616531848907477
########## Evaluation ##########
Timestep: 188499 Average reward is -0.0782
INFO - Step 188659, loss: 0.54299736022949227
########## Evaluation ##########
Timestep: 188659 Average reward is -0.0602
INFO - Step 188792, loss: 0.50854831933975227
####

  fig, ax = plt.subplots()


INFO - Step 189200, loss: 0.43678405880928046
INFO - Copied model parameters to target network.
INFO - Step 189223, loss: 0.49533066153526306
########## Evaluation ##########
Timestep: 189223 Average reward is -0.0564
INFO - Step 189388, loss: 0.46845519542694097
########## Evaluation ##########
Timestep: 189388 Average reward is -0.0495
INFO - Step 189536, loss: 0.34877404570579536
########## Evaluation ##########
Timestep: 189536 Average reward is -0.062
INFO - Step 189695, loss: 0.55163317918777477
########## Evaluation ##########
Timestep: 189695 Average reward is -0.0565
INFO - Step 189854, loss: 0.61131465435028085
########## Evaluation ##########
Timestep: 189854 Average reward is -0.0495
INFO - Step 190009, loss: 0.60381305217742923
########## Evaluation ##########
Timestep: 190009 Average reward is -0.0849
INFO - Step 190162, loss: 0.49967092275619507
########## Evaluation ##########
Timestep: 190162 Average reward is -0.0598
INFO - Step 190200, loss: 0.44814172387123113
INFO 

  fig, ax = plt.subplots()


INFO - Step 190750, loss: 0.72475093603134164
########## Evaluation ##########
Timestep: 190750 Average reward is -0.059
INFO - Step 190892, loss: 0.54473316669464115
########## Evaluation ##########
Timestep: 190892 Average reward is -0.052
INFO - Step 191052, loss: 0.55238693952560424
########## Evaluation ##########
Timestep: 191052 Average reward is -0.0518
INFO - Step 191197, loss: 0.51471602916717534
########## Evaluation ##########
Timestep: 191197 Average reward is -0.0506
INFO - Step 191200, loss: 0.31665751338005066
INFO - Copied model parameters to target network.
INFO - Step 191351, loss: 0.57491570711135866
########## Evaluation ##########
Timestep: 191351 Average reward is -0.0643
INFO - Step 191501, loss: 0.40689224004745483
########## Evaluation ##########
Timestep: 191501 Average reward is -0.0761
INFO - Step 191644, loss: 0.42768064141273594
########## Evaluation ##########
Timestep: 191644 Average reward is -0.0447
INFO - Step 191785, loss: 0.44350513815879825
######

  fig, ax = plt.subplots()


INFO - Step 192200, loss: 0.42594242095947266
INFO - Copied model parameters to target network.
INFO - Step 192222, loss: 0.37326359748840335
########## Evaluation ##########
Timestep: 192222 Average reward is -0.0409
INFO - Step 192359, loss: 0.39251524209976196
########## Evaluation ##########
Timestep: 192359 Average reward is -0.0417
INFO - Step 192508, loss: 0.54706978797912627
########## Evaluation ##########
Timestep: 192508 Average reward is -0.0457
INFO - Step 192654, loss: 0.39154720306396484
########## Evaluation ##########
Timestep: 192654 Average reward is -0.0481
INFO - Step 192800, loss: 0.36273455619812015
########## Evaluation ##########
Timestep: 192800 Average reward is -0.0409
INFO - Step 192946, loss: 0.38562691211700443
########## Evaluation ##########
Timestep: 192946 Average reward is -0.052
INFO - Step 193092, loss: 0.67563140392303476
########## Evaluation ##########
Timestep: 193092 Average reward is -0.0499
INFO - Step 193200, loss: 0.35920977592468267
INFO 

  fig, ax = plt.subplots()


INFO - Step 193677, loss: 0.41239508986473083
########## Evaluation ##########
Timestep: 193677 Average reward is -0.0812
INFO - Step 193826, loss: 0.48404109477996826
########## Evaluation ##########
Timestep: 193826 Average reward is -0.0569
INFO - Step 193975, loss: 0.56518602371215826
########## Evaluation ##########
Timestep: 193975 Average reward is -0.0529
INFO - Step 194116, loss: 0.48612084984779364
########## Evaluation ##########
Timestep: 194116 Average reward is -0.0353
INFO - Step 194200, loss: 0.51960790157318126
INFO - Copied model parameters to target network.
INFO - Step 194273, loss: 0.47170171141624454
########## Evaluation ##########
Timestep: 194273 Average reward is -0.0573
INFO - Step 194418, loss: 0.50223970413208014
########## Evaluation ##########
Timestep: 194418 Average reward is -0.0603
INFO - Step 194588, loss: 0.48899152874946594
########## Evaluation ##########
Timestep: 194588 Average reward is -0.0668
INFO - Step 194724, loss: 0.45035636425018313
####

  fig, ax = plt.subplots()


INFO - Step 195160, loss: 0.58374118804931647
########## Evaluation ##########
Timestep: 195160 Average reward is -0.0768
INFO - Step 195200, loss: 0.40082001686096194
INFO - Copied model parameters to target network.
INFO - Step 195309, loss: 0.53644359111785896
########## Evaluation ##########
Timestep: 195309 Average reward is -0.0656
INFO - Step 195448, loss: 0.41805177927017216
########## Evaluation ##########
Timestep: 195448 Average reward is -0.057
INFO - Step 195603, loss: 0.63538539409637456
########## Evaluation ##########
Timestep: 195603 Average reward is -0.0543
INFO - Step 195744, loss: 0.51289677619934085
########## Evaluation ##########
Timestep: 195744 Average reward is -0.0521
INFO - Step 195894, loss: 0.62799692153930664
########## Evaluation ##########
Timestep: 195894 Average reward is -0.0615
INFO - Step 196046, loss: 0.47356802225112915
########## Evaluation ##########
Timestep: 196046 Average reward is -0.0666
INFO - Step 196195, loss: 0.64295059442520145
#####

  fig, ax = plt.subplots()


INFO - Step 196630, loss: 0.47119671106338554
########## Evaluation ##########
Timestep: 196630 Average reward is -0.0516
INFO - Step 196771, loss: 0.35827091336250305
########## Evaluation ##########
Timestep: 196771 Average reward is -0.0519
INFO - Step 196914, loss: 0.50807070732116746
########## Evaluation ##########
Timestep: 196914 Average reward is -0.0615
INFO - Step 197057, loss: 0.38233819603919983
########## Evaluation ##########
Timestep: 197057 Average reward is -0.0501
INFO - Step 197200, loss: 0.72041314840316774
INFO - Copied model parameters to target network.
INFO - Step 197206, loss: 0.47888290882110596
########## Evaluation ##########
Timestep: 197206 Average reward is -0.0648
INFO - Step 197367, loss: 0.55585068464279175
########## Evaluation ##########
Timestep: 197367 Average reward is -0.0473
INFO - Step 197511, loss: 0.50022411346435555
########## Evaluation ##########
Timestep: 197511 Average reward is -0.0534
INFO - Step 197669, loss: 0.36652708053588867
####

  fig, ax = plt.subplots()


INFO - Step 198111, loss: 0.43800279498100285
########## Evaluation ##########
Timestep: 198111 Average reward is -0.0472
INFO - Step 198200, loss: 0.48347562551498413
INFO - Copied model parameters to target network.
INFO - Step 198246, loss: 0.59718102216720586
########## Evaluation ##########
Timestep: 198246 Average reward is -0.0577
INFO - Step 198385, loss: 0.37312692403793335
########## Evaluation ##########
Timestep: 198385 Average reward is -0.0522
INFO - Step 198537, loss: 0.51446068286895756
########## Evaluation ##########
Timestep: 198537 Average reward is -0.0641
INFO - Step 198694, loss: 0.37949973344802856
########## Evaluation ##########
Timestep: 198694 Average reward is -0.0643
INFO - Step 198846, loss: 0.56597089767456055
########## Evaluation ##########
Timestep: 198846 Average reward is -0.055
INFO - Step 198984, loss: 0.44948130846023566
########## Evaluation ##########
Timestep: 198984 Average reward is -0.0497
INFO - Step 199133, loss: 0.54364299774169927
#####

  fig, ax = plt.subplots()


INFO - Step 199587, loss: 0.44689923524856576
########## Evaluation ##########
Timestep: 199587 Average reward is -0.068
INFO - Step 199730, loss: 0.48045086860656746
########## Evaluation ##########
Timestep: 199730 Average reward is -0.065
INFO - Step 199869, loss: 0.52868819236755375
########## Evaluation ##########
Timestep: 199869 Average reward is -0.0577
INFO - Step 200022, loss: 0.36159333586692815
########## Evaluation ##########
Timestep: 200022 Average reward is -0.0504
INFO - Step 200180, loss: 0.38076111674308777
########## Evaluation ##########
Timestep: 200180 Average reward is -0.0534
INFO - Step 200200, loss: 0.45756566524505615
INFO - Copied model parameters to target network.
INFO - Step 200328, loss: 0.50724375247955325
########## Evaluation ##########
Timestep: 200328 Average reward is -0.0744
INFO - Step 200473, loss: 0.31248769164085396
########## Evaluation ##########
Timestep: 200473 Average reward is -0.0489
INFO - Step 200620, loss: 0.48504012823104865
######

  fig, ax = plt.subplots()


INFO - Step 201037, loss: 0.60532540082931523
########## Evaluation ##########
Timestep: 201037 Average reward is -0.0541
INFO - Step 201181, loss: 0.53679579496383677
########## Evaluation ##########
Timestep: 201181 Average reward is -0.0573
INFO - Step 201200, loss: 0.49758175015449524
INFO - Copied model parameters to target network.
INFO - Step 201330, loss: 0.49616459012031555
########## Evaluation ##########
Timestep: 201330 Average reward is -0.0479
INFO - Step 201472, loss: 0.68035328388214114
########## Evaluation ##########
Timestep: 201472 Average reward is -0.0699
INFO - Step 201614, loss: 0.58511090278625495
########## Evaluation ##########
Timestep: 201614 Average reward is -0.0692
INFO - Step 201761, loss: 0.53102821111679084
########## Evaluation ##########
Timestep: 201761 Average reward is -0.0799
INFO - Step 201911, loss: 0.45748886466026306
########## Evaluation ##########
Timestep: 201911 Average reward is -0.0564
INFO - Step 202060, loss: 0.54346990585327154
####

  fig, ax = plt.subplots()


INFO - Step 202500, loss: 0.52143418788909916
########## Evaluation ##########
Timestep: 202500 Average reward is -0.0515
INFO - Step 202653, loss: 0.60367518663406374
########## Evaluation ##########
Timestep: 202653 Average reward is -0.0653
INFO - Step 202809, loss: 0.27444306015968324
########## Evaluation ##########
Timestep: 202809 Average reward is -0.0634
INFO - Step 202949, loss: 0.47212651371955874
########## Evaluation ##########
Timestep: 202949 Average reward is -0.0605
INFO - Step 203094, loss: 0.32432925701141366
########## Evaluation ##########
Timestep: 203094 Average reward is -0.0512
INFO - Step 203200, loss: 0.50472581386566163
INFO - Copied model parameters to target network.
INFO - Step 203239, loss: 0.50246375799179086
########## Evaluation ##########
Timestep: 203239 Average reward is -0.0498
INFO - Step 203386, loss: 0.45160028338432314
########## Evaluation ##########
Timestep: 203386 Average reward is -0.0603
INFO - Step 203536, loss: 0.38902670145034796
####

  fig, ax = plt.subplots()


INFO - Step 203975, loss: 0.44330430030822754
########## Evaluation ##########
Timestep: 203975 Average reward is -0.075
INFO - Step 204117, loss: 0.53046345710754415
########## Evaluation ##########
Timestep: 204117 Average reward is -0.0603
INFO - Step 204200, loss: 0.52265977859497076
INFO - Copied model parameters to target network.
INFO - Step 204259, loss: 0.63663876056671143
########## Evaluation ##########
Timestep: 204259 Average reward is -0.0462
INFO - Step 204411, loss: 0.46094185113906862
########## Evaluation ##########
Timestep: 204411 Average reward is -0.0526
INFO - Step 204577, loss: 0.55488491058349617
########## Evaluation ##########
Timestep: 204577 Average reward is -0.0601
INFO - Step 204717, loss: 0.46728950738906865
########## Evaluation ##########
Timestep: 204717 Average reward is -0.0694
INFO - Step 204869, loss: 0.46539300680160524
########## Evaluation ##########
Timestep: 204869 Average reward is -0.0613
INFO - Step 205020, loss: 0.39641463756561283
#####

  fig, ax = plt.subplots()


INFO - Step 205458, loss: 0.46151965856552124
########## Evaluation ##########
Timestep: 205458 Average reward is -0.061
INFO - Step 205611, loss: 0.44218760728836064
########## Evaluation ##########
Timestep: 205611 Average reward is -0.0624
INFO - Step 205755, loss: 0.56025755405426033
########## Evaluation ##########
Timestep: 205755 Average reward is -0.0585
INFO - Step 205902, loss: 0.51942163705825815
########## Evaluation ##########
Timestep: 205902 Average reward is -0.0652
INFO - Step 206058, loss: 0.54111850261688234
########## Evaluation ##########
Timestep: 206058 Average reward is -0.0386
INFO - Step 206200, loss: 0.59788465499877936
INFO - Copied model parameters to target network.
INFO - Step 206207, loss: 0.50388354063034066
########## Evaluation ##########
Timestep: 206207 Average reward is -0.0507
INFO - Step 206369, loss: 0.43732094764709476
########## Evaluation ##########
Timestep: 206369 Average reward is -0.0715
INFO - Step 206516, loss: 0.39813530445098877
#####

  fig, ax = plt.subplots()


INFO - Step 206951, loss: 0.43360024690628053
########## Evaluation ##########
Timestep: 206951 Average reward is -0.0572
INFO - Step 207103, loss: 0.44924938678741455
########## Evaluation ##########
Timestep: 207103 Average reward is -0.0535
INFO - Step 207200, loss: 0.40468305349349976
INFO - Copied model parameters to target network.
INFO - Step 207245, loss: 0.55139732360839846
########## Evaluation ##########
Timestep: 207245 Average reward is -0.0457
INFO - Step 207393, loss: 0.56045109033584653
########## Evaluation ##########
Timestep: 207393 Average reward is -0.0641
INFO - Step 207532, loss: 0.45770436525344854
########## Evaluation ##########
Timestep: 207532 Average reward is -0.055
INFO - Step 207670, loss: 0.35652810335159387
########## Evaluation ##########
Timestep: 207670 Average reward is -0.0441
INFO - Step 207828, loss: 0.33452183008193974
########## Evaluation ##########
Timestep: 207828 Average reward is -0.0777
INFO - Step 207969, loss: 0.39505150914192274
#####

  fig, ax = plt.subplots()


INFO - Step 208409, loss: 0.59938681125640876
########## Evaluation ##########
Timestep: 208409 Average reward is -0.0392
INFO - Step 208552, loss: 0.41117012500762947
########## Evaluation ##########
Timestep: 208552 Average reward is -0.0694
INFO - Step 208704, loss: 0.38216498494148254
########## Evaluation ##########
Timestep: 208704 Average reward is -0.0684
INFO - Step 208844, loss: 0.32205525040626526
########## Evaluation ##########
Timestep: 208844 Average reward is -0.0624
INFO - Step 208992, loss: 0.53154611587524415
########## Evaluation ##########
Timestep: 208992 Average reward is -0.0518
INFO - Step 209137, loss: 0.46691954135894775
########## Evaluation ##########
Timestep: 209137 Average reward is -0.0687
INFO - Step 209200, loss: 0.64881908893585256
INFO - Copied model parameters to target network.
INFO - Step 209283, loss: 0.46192041039466863
########## Evaluation ##########
Timestep: 209283 Average reward is -0.047
INFO - Step 209433, loss: 0.52284359931945867
#####

  fig, ax = plt.subplots()


INFO - Step 209886, loss: 0.41667729616165167
########## Evaluation ##########
Timestep: 209886 Average reward is -0.0671
INFO - Step 210032, loss: 0.40690851211547857
########## Evaluation ##########
Timestep: 210032 Average reward is -0.0503
INFO - Step 210186, loss: 0.50292223691940315
########## Evaluation ##########
Timestep: 210186 Average reward is -0.0496
INFO - Step 210200, loss: 0.55497920513153086
INFO - Copied model parameters to target network.
INFO - Step 210340, loss: 0.53168463706970215
########## Evaluation ##########
Timestep: 210340 Average reward is -0.0643
INFO - Step 210485, loss: 0.55844163894653325
########## Evaluation ##########
Timestep: 210485 Average reward is -0.0601
INFO - Step 210650, loss: 0.42735916376113894
########## Evaluation ##########
Timestep: 210650 Average reward is -0.0525
INFO - Step 210794, loss: 0.58146548271179254
########## Evaluation ##########
Timestep: 210794 Average reward is -0.0595
INFO - Step 210943, loss: 0.47967290878295926
####

  fig, ax = plt.subplots()


INFO - Step 211378, loss: 0.45516741275787354
########## Evaluation ##########
Timestep: 211378 Average reward is -0.0602
INFO - Step 211528, loss: 0.52634823322296145
########## Evaluation ##########
Timestep: 211528 Average reward is -0.0682
INFO - Step 211675, loss: 0.39496403932571416
########## Evaluation ##########
Timestep: 211675 Average reward is -0.0527
INFO - Step 211832, loss: 0.49245628714561465
########## Evaluation ##########
Timestep: 211832 Average reward is -0.0454
INFO - Step 211982, loss: 0.39266207814216614
########## Evaluation ##########
Timestep: 211982 Average reward is -0.0535
INFO - Step 212127, loss: 0.60325223207473754
########## Evaluation ##########
Timestep: 212127 Average reward is -0.0691
INFO - Step 212200, loss: 0.56660753488540653
INFO - Copied model parameters to target network.
INFO - Step 212284, loss: 0.39122617244720464
########## Evaluation ##########
Timestep: 212284 Average reward is -0.0728
INFO - Step 212435, loss: 0.58763289451599124
####

  fig, ax = plt.subplots()


INFO - Step 212889, loss: 0.79382681846618654
########## Evaluation ##########
Timestep: 212889 Average reward is -0.0468
INFO - Step 213032, loss: 0.48696768283843994
########## Evaluation ##########
Timestep: 213032 Average reward is -0.0422
INFO - Step 213190, loss: 0.57231247425079357
########## Evaluation ##########
Timestep: 213190 Average reward is -0.0716
INFO - Step 213200, loss: 0.43615978956222534
INFO - Copied model parameters to target network.
INFO - Step 213342, loss: 0.45067608356475834
########## Evaluation ##########
Timestep: 213342 Average reward is -0.0538
INFO - Step 213491, loss: 0.35078954696655273
########## Evaluation ##########
Timestep: 213491 Average reward is -0.0629
INFO - Step 213634, loss: 0.51853013038635254
########## Evaluation ##########
Timestep: 213634 Average reward is -0.063
INFO - Step 213776, loss: 0.42930656671524055
########## Evaluation ##########
Timestep: 213776 Average reward is -0.068
INFO - Step 213930, loss: 0.56195014715194746
######

  fig, ax = plt.subplots()


INFO - Step 214349, loss: 0.41852989792823797
########## Evaluation ##########
Timestep: 214349 Average reward is -0.0634
INFO - Step 214492, loss: 0.60519373416900636
########## Evaluation ##########
Timestep: 214492 Average reward is -0.0647
INFO - Step 214622, loss: 0.71757692098617556
########## Evaluation ##########
Timestep: 214622 Average reward is -0.0737
INFO - Step 214760, loss: 0.37380227446556095
########## Evaluation ##########
Timestep: 214760 Average reward is -0.0667
INFO - Step 214912, loss: 0.53063160181045535
########## Evaluation ##########
Timestep: 214912 Average reward is -0.0574
INFO - Step 215049, loss: 0.51171600818634035
########## Evaluation ##########
Timestep: 215049 Average reward is -0.0502
INFO - Step 215192, loss: 0.52473706007003786
########## Evaluation ##########
Timestep: 215192 Average reward is -0.0707
INFO - Step 215200, loss: 0.64793002605438237
INFO - Copied model parameters to target network.
INFO - Step 215345, loss: 0.55917274951934817
####

  fig, ax = plt.subplots()


INFO - Step 215769, loss: 0.32929909229278564
########## Evaluation ##########
Timestep: 215769 Average reward is -0.0691
INFO - Step 215911, loss: 0.39219456911087036
########## Evaluation ##########
Timestep: 215911 Average reward is -0.0784
INFO - Step 216066, loss: 0.43662211298942566
########## Evaluation ##########
Timestep: 216066 Average reward is -0.0636
INFO - Step 216200, loss: 0.42193830013275146
INFO - Copied model parameters to target network.
INFO - Step 216213, loss: 0.45797491073608497
########## Evaluation ##########
Timestep: 216213 Average reward is -0.0488
INFO - Step 216361, loss: 0.47685796022415165
########## Evaluation ##########
Timestep: 216361 Average reward is -0.0449
INFO - Step 216510, loss: 0.43247008323669434
########## Evaluation ##########
Timestep: 216510 Average reward is -0.0496
INFO - Step 216654, loss: 0.51943933963775633
########## Evaluation ##########
Timestep: 216654 Average reward is -0.0451
INFO - Step 216799, loss: 0.61096501350402834
####

  fig, ax = plt.subplots()


INFO - Step 217200, loss: 0.43677657842636114
INFO - Copied model parameters to target network.
INFO - Step 217247, loss: 0.41926604509353644
########## Evaluation ##########
Timestep: 217247 Average reward is -0.0854
INFO - Step 217397, loss: 0.45487374067306526
########## Evaluation ##########
Timestep: 217397 Average reward is -0.0671
INFO - Step 217543, loss: 0.44939896464347845
########## Evaluation ##########
Timestep: 217543 Average reward is -0.0568
INFO - Step 217686, loss: 0.58556580543518075
########## Evaluation ##########
Timestep: 217686 Average reward is -0.0588
INFO - Step 217833, loss: 0.45118603110313416
########## Evaluation ##########
Timestep: 217833 Average reward is -0.0567
INFO - Step 217972, loss: 0.44534963369369507
########## Evaluation ##########
Timestep: 217972 Average reward is -0.0464
INFO - Step 218128, loss: 0.42801156640052795
########## Evaluation ##########
Timestep: 218128 Average reward is -0.0746
INFO - Step 218200, loss: 0.54303699731826787
INFO

  fig, ax = plt.subplots()


INFO - Step 218723, loss: 0.53954529762268077
########## Evaluation ##########
Timestep: 218723 Average reward is -0.0627
INFO - Step 218868, loss: 0.41802448034286594
########## Evaluation ##########
Timestep: 218868 Average reward is -0.0601
INFO - Step 219015, loss: 0.33092945814132695
########## Evaluation ##########
Timestep: 219015 Average reward is -0.0548
INFO - Step 219161, loss: 0.48075360059738163
########## Evaluation ##########
Timestep: 219161 Average reward is -0.0558
INFO - Step 219200, loss: 0.37676471471786554
INFO - Copied model parameters to target network.
INFO - Step 219300, loss: 0.42637446522712715
########## Evaluation ##########
Timestep: 219300 Average reward is -0.0578
INFO - Step 219454, loss: 0.49137428402900696
########## Evaluation ##########
Timestep: 219454 Average reward is -0.0574
INFO - Step 219613, loss: 0.58852875232696533
########## Evaluation ##########
Timestep: 219613 Average reward is -0.0572
INFO - Step 219756, loss: 0.50046861171722416
####

  fig, ax = plt.subplots()


INFO - Step 220200, loss: 0.54192441701889047
INFO - Copied model parameters to target network.
INFO - Step 220210, loss: 0.62934148311614996
########## Evaluation ##########
Timestep: 220210 Average reward is -0.0504
INFO - Step 220359, loss: 0.61931222677230836
########## Evaluation ##########
Timestep: 220359 Average reward is -0.0602
INFO - Step 220502, loss: 0.53845053911209116
########## Evaluation ##########
Timestep: 220502 Average reward is -0.0451
INFO - Step 220646, loss: 0.50780820846557626
########## Evaluation ##########
Timestep: 220646 Average reward is -0.0457
INFO - Step 220796, loss: 0.42334926128387456
########## Evaluation ##########
Timestep: 220796 Average reward is -0.0695
INFO - Step 220948, loss: 0.57410162687301645
########## Evaluation ##########
Timestep: 220948 Average reward is -0.0636
INFO - Step 221096, loss: 0.55512940883636475
########## Evaluation ##########
Timestep: 221096 Average reward is -0.0662
INFO - Step 221200, loss: 0.43280598521232605
INFO

  fig, ax = plt.subplots()


INFO - Step 221710, loss: 0.48899394273757935
########## Evaluation ##########
Timestep: 221710 Average reward is -0.054
INFO - Step 221852, loss: 0.62481510639190674
########## Evaluation ##########
Timestep: 221852 Average reward is -0.0527
INFO - Step 222005, loss: 0.46495050191879274
########## Evaluation ##########
Timestep: 222005 Average reward is -0.0606
INFO - Step 222148, loss: 0.39639329910278326
########## Evaluation ##########
Timestep: 222148 Average reward is -0.0593
INFO - Step 222200, loss: 0.37002253532409673
INFO - Copied model parameters to target network.
INFO - Step 222294, loss: 0.55408734083175666
########## Evaluation ##########
Timestep: 222294 Average reward is -0.0626
INFO - Step 222453, loss: 0.61671614646911624
########## Evaluation ##########
Timestep: 222453 Average reward is -0.0556
INFO - Step 222616, loss: 0.46118712425231934
########## Evaluation ##########
Timestep: 222616 Average reward is -0.0323
INFO - Step 222756, loss: 0.33132904767990113
#####

  fig, ax = plt.subplots()


INFO - Step 223200, loss: 0.59157854318618777
INFO - Copied model parameters to target network.
INFO - Step 223220, loss: 0.56225723028182984
########## Evaluation ##########
Timestep: 223220 Average reward is -0.059
INFO - Step 223363, loss: 0.61160087585449224
########## Evaluation ##########
Timestep: 223363 Average reward is -0.0609
INFO - Step 223515, loss: 0.65188664197921756
########## Evaluation ##########
Timestep: 223515 Average reward is -0.0384
INFO - Step 223654, loss: 0.35507422685623175
########## Evaluation ##########
Timestep: 223654 Average reward is -0.0507
INFO - Step 223795, loss: 0.42126461863517763
########## Evaluation ##########
Timestep: 223795 Average reward is -0.0562
INFO - Step 223943, loss: 0.64539408683776863
########## Evaluation ##########
Timestep: 223943 Average reward is -0.0506
INFO - Step 224096, loss: 0.50789088010787965
########## Evaluation ##########
Timestep: 224096 Average reward is -0.0522
INFO - Step 224200, loss: 0.55781841278076173
INFO 

  fig, ax = plt.subplots()


INFO - Step 224694, loss: 0.52089834213256845
########## Evaluation ##########
Timestep: 224694 Average reward is -0.0582
INFO - Step 224836, loss: 0.60245645046234134
########## Evaluation ##########
Timestep: 224836 Average reward is -0.071
INFO - Step 224973, loss: 0.56953638792037964
########## Evaluation ##########
Timestep: 224973 Average reward is -0.0671
INFO - Step 225110, loss: 0.61358797550201424
########## Evaluation ##########
Timestep: 225110 Average reward is -0.0557
INFO - Step 225200, loss: 0.56455612182617196
INFO - Copied model parameters to target network.
INFO - Step 225258, loss: 0.54318171739578257
########## Evaluation ##########
Timestep: 225258 Average reward is -0.0536
INFO - Step 225404, loss: 0.42639222741127014
########## Evaluation ##########
Timestep: 225404 Average reward is -0.0522
INFO - Step 225560, loss: 0.47635129094123845
########## Evaluation ##########
Timestep: 225560 Average reward is -0.0583
INFO - Step 225707, loss: 0.61589431762695313
#####

  fig, ax = plt.subplots()


INFO - Step 226149, loss: 0.42352497577667236
########## Evaluation ##########
Timestep: 226149 Average reward is -0.0474
INFO - Step 226200, loss: 0.32500100135803227
INFO - Copied model parameters to target network.
INFO - Step 226305, loss: 0.36016729474067694
########## Evaluation ##########
Timestep: 226305 Average reward is -0.0581
INFO - Step 226451, loss: 0.42896398901939395
########## Evaluation ##########
Timestep: 226451 Average reward is -0.042
INFO - Step 226604, loss: 0.52636384963989267
########## Evaluation ##########
Timestep: 226604 Average reward is -0.0524
INFO - Step 226760, loss: 0.43815553188323975
########## Evaluation ##########
Timestep: 226760 Average reward is -0.0581
INFO - Step 226910, loss: 0.56689816713333134
########## Evaluation ##########
Timestep: 226910 Average reward is -0.0653
INFO - Step 227062, loss: 0.34604060649871826
########## Evaluation ##########
Timestep: 227062 Average reward is -0.0643
INFO - Step 227200, loss: 0.41545397043228154
INFO 

  fig, ax = plt.subplots()


INFO - Step 227653, loss: 0.47312682867050175
########## Evaluation ##########
Timestep: 227653 Average reward is -0.0493
INFO - Step 227792, loss: 0.61866021156311043
########## Evaluation ##########
Timestep: 227792 Average reward is -0.0693
INFO - Step 227935, loss: 0.46946004033088684
########## Evaluation ##########
Timestep: 227935 Average reward is -0.0698
INFO - Step 228070, loss: 0.51104271411895756
########## Evaluation ##########
Timestep: 228070 Average reward is -0.0674
INFO - Step 228200, loss: 0.47077387571334845
INFO - Copied model parameters to target network.
INFO - Step 228224, loss: 0.38785749673843384
########## Evaluation ##########
Timestep: 228224 Average reward is -0.0523
INFO - Step 228380, loss: 0.53660309314727784
########## Evaluation ##########
Timestep: 228380 Average reward is -0.0533
INFO - Step 228529, loss: 0.50705415010452275
########## Evaluation ##########
Timestep: 228529 Average reward is -0.0544
INFO - Step 228670, loss: 0.47194084525108343
####

  fig, ax = plt.subplots()


INFO - Step 229135, loss: 0.65304261445999153
########## Evaluation ##########
Timestep: 229135 Average reward is -0.0545
INFO - Step 229200, loss: 0.63568323850631716
INFO - Copied model parameters to target network.
INFO - Step 229284, loss: 0.40646135807037354
########## Evaluation ##########
Timestep: 229284 Average reward is -0.0638
INFO - Step 229436, loss: 0.59294193983078074
########## Evaluation ##########
Timestep: 229436 Average reward is -0.0558
INFO - Step 229596, loss: 0.53612446784973146
########## Evaluation ##########
Timestep: 229596 Average reward is -0.065
INFO - Step 229750, loss: 0.66412985324859627
########## Evaluation ##########
Timestep: 229750 Average reward is -0.063
INFO - Step 229898, loss: 0.35218974947929385
########## Evaluation ##########
Timestep: 229898 Average reward is -0.0475
INFO - Step 230053, loss: 0.33708047866821296
########## Evaluation ##########
Timestep: 230053 Average reward is -0.0518
INFO - Step 230189, loss: 0.43051636219024667
######

  fig, ax = plt.subplots()


INFO - Step 230638, loss: 0.43829342722892767
########## Evaluation ##########
Timestep: 230638 Average reward is -0.0664
INFO - Step 230790, loss: 0.52084761857986455
########## Evaluation ##########
Timestep: 230790 Average reward is -0.0551
INFO - Step 230940, loss: 0.54654824733734135
########## Evaluation ##########
Timestep: 230940 Average reward is -0.0445
INFO - Step 231084, loss: 0.43301981687545776
########## Evaluation ##########
Timestep: 231084 Average reward is -0.0671
INFO - Step 231200, loss: 0.67262881994247445
INFO - Copied model parameters to target network.
INFO - Step 231235, loss: 0.59025281667709355
########## Evaluation ##########
Timestep: 231235 Average reward is -0.0604
INFO - Step 231377, loss: 0.47072988748550415
########## Evaluation ##########
Timestep: 231377 Average reward is -0.0479
INFO - Step 231527, loss: 0.39064255356788635
########## Evaluation ##########
Timestep: 231527 Average reward is -0.0602
INFO - Step 231672, loss: 0.58208394050598142
####

  fig, ax = plt.subplots()


INFO - Step 232115, loss: 0.73126631975173954
########## Evaluation ##########
Timestep: 232115 Average reward is -0.0519
INFO - Step 232200, loss: 0.40828216075897217
INFO - Copied model parameters to target network.
INFO - Step 232269, loss: 0.38345274329185486
########## Evaluation ##########
Timestep: 232269 Average reward is -0.0542
INFO - Step 232415, loss: 0.48720732331275944
########## Evaluation ##########
Timestep: 232415 Average reward is -0.064
INFO - Step 232561, loss: 0.41754460334777834
########## Evaluation ##########
Timestep: 232561 Average reward is -0.0567
INFO - Step 232704, loss: 0.69426685571670535
########## Evaluation ##########
Timestep: 232704 Average reward is -0.0552
INFO - Step 232860, loss: 0.53475141525268555
########## Evaluation ##########
Timestep: 232860 Average reward is -0.063
INFO - Step 233008, loss: 0.43079286813735967
########## Evaluation ##########
Timestep: 233008 Average reward is -0.0605
INFO - Step 233155, loss: 0.57781499624252323
######

  fig, ax = plt.subplots()


INFO - Step 233596, loss: 0.62388753890991213
########## Evaluation ##########
Timestep: 233596 Average reward is -0.0544
INFO - Step 233741, loss: 0.62527537345886235
########## Evaluation ##########
Timestep: 233741 Average reward is -0.0597
INFO - Step 233881, loss: 0.34021726250648574
########## Evaluation ##########
Timestep: 233881 Average reward is -0.0541
INFO - Step 234041, loss: 0.37325859069824223
########## Evaluation ##########
Timestep: 234041 Average reward is -0.055
INFO - Step 234191, loss: 0.52659279108047496
########## Evaluation ##########
Timestep: 234191 Average reward is -0.0705
INFO - Step 234200, loss: 0.46208333969116217
INFO - Copied model parameters to target network.
INFO - Step 234344, loss: 0.54215955734252937
########## Evaluation ##########
Timestep: 234344 Average reward is -0.0563
INFO - Step 234493, loss: 0.61853098869323733
########## Evaluation ##########
Timestep: 234493 Average reward is -0.0628
INFO - Step 234649, loss: 0.62639838457107545
#####

  fig, ax = plt.subplots()


INFO - Step 235080, loss: 0.78928983211517335
########## Evaluation ##########
Timestep: 235080 Average reward is -0.0564
INFO - Step 235200, loss: 0.49943202733993535
INFO - Copied model parameters to target network.
INFO - Step 235227, loss: 0.69314444065093996
########## Evaluation ##########
Timestep: 235227 Average reward is -0.0491
INFO - Step 235379, loss: 0.44642522931098947
########## Evaluation ##########
Timestep: 235379 Average reward is -0.0591
INFO - Step 235526, loss: 0.57795912027359015
########## Evaluation ##########
Timestep: 235526 Average reward is -0.0731
INFO - Step 235667, loss: 0.46616333723068244
########## Evaluation ##########
Timestep: 235667 Average reward is -0.0643
INFO - Step 235803, loss: 0.50217616558074956
########## Evaluation ##########
Timestep: 235803 Average reward is -0.0568
INFO - Step 235949, loss: 0.41900503635406494
########## Evaluation ##########
Timestep: 235949 Average reward is -0.0713
INFO - Step 236101, loss: 0.53541088104248053
####

  fig, ax = plt.subplots()


INFO - Step 236537, loss: 0.34395360946655273
########## Evaluation ##########
Timestep: 236537 Average reward is -0.0747
INFO - Step 236699, loss: 0.56945395469665535
########## Evaluation ##########
Timestep: 236699 Average reward is -0.0445
INFO - Step 236851, loss: 0.30481436848640444
########## Evaluation ##########
Timestep: 236851 Average reward is -0.0551
INFO - Step 237000, loss: 0.55340850353240974
########## Evaluation ##########
Timestep: 237000 Average reward is -0.0778
INFO - Step 237141, loss: 0.76714134216308594
########## Evaluation ##########
Timestep: 237141 Average reward is -0.0441
INFO - Step 237200, loss: 0.57270610332489017
INFO - Copied model parameters to target network.
INFO - Step 237287, loss: 0.47908258438110354
########## Evaluation ##########
Timestep: 237287 Average reward is -0.0518
INFO - Step 237417, loss: 0.55989938974380496
########## Evaluation ##########
Timestep: 237417 Average reward is -0.0497
INFO - Step 237564, loss: 0.37406527996063233
####

  fig, ax = plt.subplots()


INFO - Step 238020, loss: 0.56047576665878364
########## Evaluation ##########
Timestep: 238020 Average reward is -0.0578
INFO - Step 238165, loss: 0.60393989086151127
########## Evaluation ##########
Timestep: 238165 Average reward is -0.0678
INFO - Step 238200, loss: 0.48178234696388245
INFO - Copied model parameters to target network.
INFO - Step 238307, loss: 0.46475782990455635
########## Evaluation ##########
Timestep: 238307 Average reward is -0.0798
INFO - Step 238445, loss: 0.56718140840530476
########## Evaluation ##########
Timestep: 238445 Average reward is -0.0624
INFO - Step 238595, loss: 0.38485813140869143
########## Evaluation ##########
Timestep: 238595 Average reward is -0.0497
INFO - Step 238741, loss: 0.64435839653015146
########## Evaluation ##########
Timestep: 238741 Average reward is -0.0698
INFO - Step 238890, loss: 0.74714899063110355
########## Evaluation ##########
Timestep: 238890 Average reward is -0.0623
INFO - Step 239030, loss: 0.49538475275039673
####

  fig, ax = plt.subplots()


INFO - Step 239451, loss: 0.44707351922988893
########## Evaluation ##########
Timestep: 239451 Average reward is -0.06
INFO - Step 239604, loss: 0.50939130783081054
########## Evaluation ##########
Timestep: 239604 Average reward is -0.0365
INFO - Step 239758, loss: 0.49658650159835815
########## Evaluation ##########
Timestep: 239758 Average reward is -0.0482
INFO - Step 239903, loss: 0.53559839725494385
########## Evaluation ##########
Timestep: 239903 Average reward is -0.0414
INFO - Step 240046, loss: 0.61019456386566163
########## Evaluation ##########
Timestep: 240046 Average reward is -0.0662
INFO - Step 240200, loss: 0.47659766674041756
INFO - Copied model parameters to target network.
INFO - Step 240202, loss: 0.72474575042724617
########## Evaluation ##########
Timestep: 240202 Average reward is -0.0657
INFO - Step 240347, loss: 0.52256202697753915
########## Evaluation ##########
Timestep: 240347 Average reward is -0.0457
INFO - Step 240504, loss: 0.59451121091842656
######

  fig, ax = plt.subplots()


INFO - Step 240943, loss: 0.42853152751922617
########## Evaluation ##########
Timestep: 240943 Average reward is -0.0849
INFO - Step 241100, loss: 0.52024006843566987
########## Evaluation ##########
Timestep: 241100 Average reward is -0.0586
INFO - Step 241200, loss: 0.58679056167602544
INFO - Copied model parameters to target network.
INFO - Step 241254, loss: 0.64932036399841317
########## Evaluation ##########
Timestep: 241254 Average reward is -0.0677
INFO - Step 241409, loss: 0.65359914302825935
########## Evaluation ##########
Timestep: 241409 Average reward is -0.0664
INFO - Step 241555, loss: 0.43881541490554814
########## Evaluation ##########
Timestep: 241555 Average reward is -0.0414
INFO - Step 241703, loss: 0.59031236171722415
########## Evaluation ##########
Timestep: 241703 Average reward is -0.069
INFO - Step 241847, loss: 0.54501444101333624
########## Evaluation ##########
Timestep: 241847 Average reward is -0.0693
INFO - Step 241992, loss: 0.48418229818344116
#####

  fig, ax = plt.subplots()


INFO - Step 242444, loss: 0.55400145053863534
########## Evaluation ##########
Timestep: 242444 Average reward is -0.0516
INFO - Step 242590, loss: 0.44635772705078125
########## Evaluation ##########
Timestep: 242590 Average reward is -0.0723
INFO - Step 242741, loss: 0.73618638515472417
########## Evaluation ##########
Timestep: 242741 Average reward is -0.062
INFO - Step 242895, loss: 0.35807877779006965
########## Evaluation ##########
Timestep: 242895 Average reward is -0.064
INFO - Step 243049, loss: 0.43989545106887825
########## Evaluation ##########
Timestep: 243049 Average reward is -0.0638
INFO - Step 243200, loss: 0.47409567236900335
INFO - Copied model parameters to target network.
INFO - Step 243201, loss: 0.43082648515701294
########## Evaluation ##########
Timestep: 243201 Average reward is -0.0769
INFO - Step 243352, loss: 0.39252063632011414
########## Evaluation ##########
Timestep: 243352 Average reward is -0.0544
INFO - Step 243493, loss: 0.64929866790771487
######

  fig, ax = plt.subplots()


INFO - Step 243931, loss: 0.38736754655838016
########## Evaluation ##########
Timestep: 243931 Average reward is -0.047
INFO - Step 244078, loss: 0.36041548848152165
########## Evaluation ##########
Timestep: 244078 Average reward is -0.074
INFO - Step 244200, loss: 0.70809197425842294
INFO - Copied model parameters to target network.
INFO - Step 244226, loss: 0.42048242688179016
########## Evaluation ##########
Timestep: 244226 Average reward is -0.04
INFO - Step 244370, loss: 0.46917828917503357
########## Evaluation ##########
Timestep: 244370 Average reward is -0.0713
INFO - Step 244516, loss: 0.47673201560974123
########## Evaluation ##########
Timestep: 244516 Average reward is -0.0707
INFO - Step 244670, loss: 0.31821700930595446
########## Evaluation ##########
Timestep: 244670 Average reward is -0.0617
INFO - Step 244818, loss: 0.52745580673217776
########## Evaluation ##########
Timestep: 244818 Average reward is -0.0617
INFO - Step 244968, loss: 0.43660733103752136
########

  fig, ax = plt.subplots()


INFO - Step 245409, loss: 0.46171608567237854
########## Evaluation ##########
Timestep: 245409 Average reward is -0.0739
INFO - Step 245564, loss: 0.52430540323257456
########## Evaluation ##########
Timestep: 245564 Average reward is -0.0557
INFO - Step 245715, loss: 0.37454473972320557
########## Evaluation ##########
Timestep: 245715 Average reward is -0.0495
INFO - Step 245865, loss: 0.42587500810623175
########## Evaluation ##########
Timestep: 245865 Average reward is -0.0636
INFO - Step 246028, loss: 0.52546703815460223
########## Evaluation ##########
Timestep: 246028 Average reward is -0.0524
INFO - Step 246167, loss: 0.66174411773681645
########## Evaluation ##########
Timestep: 246167 Average reward is -0.0659
INFO - Step 246200, loss: 0.48000335693359375
INFO - Copied model parameters to target network.
INFO - Step 246314, loss: 0.49467355012893677
########## Evaluation ##########
Timestep: 246314 Average reward is -0.0482
INFO - Step 246460, loss: 0.54584830999374394
####

  fig, ax = plt.subplots()


INFO - Step 246894, loss: 0.49232780933380127
########## Evaluation ##########
Timestep: 246894 Average reward is -0.0649
INFO - Step 247044, loss: 0.58265155553817755
########## Evaluation ##########
Timestep: 247044 Average reward is -0.0789
INFO - Step 247197, loss: 0.65929192304611217
########## Evaluation ##########
Timestep: 247197 Average reward is -0.0514
INFO - Step 247200, loss: 0.6268198490142822
INFO - Copied model parameters to target network.
INFO - Step 247341, loss: 0.51353389024734555
########## Evaluation ##########
Timestep: 247341 Average reward is -0.0768
INFO - Step 247488, loss: 0.40689641237258916
########## Evaluation ##########
Timestep: 247488 Average reward is -0.0696
INFO - Step 247642, loss: 0.50078916549682623
########## Evaluation ##########
Timestep: 247642 Average reward is -0.0685
INFO - Step 247779, loss: 0.49757012724876404
########## Evaluation ##########
Timestep: 247779 Average reward is -0.0646
INFO - Step 247926, loss: 0.72384983301162727
#####

  fig, ax = plt.subplots()


INFO - Step 248386, loss: 0.35528710484504736
########## Evaluation ##########
Timestep: 248386 Average reward is -0.0451
INFO - Step 248531, loss: 0.43851208686828613
########## Evaluation ##########
Timestep: 248531 Average reward is -0.0474
INFO - Step 248687, loss: 0.64208626747131353
########## Evaluation ##########
Timestep: 248687 Average reward is -0.0585
INFO - Step 248836, loss: 0.49381875991821296
########## Evaluation ##########
Timestep: 248836 Average reward is -0.0508
INFO - Step 248982, loss: 0.46859472990036015
########## Evaluation ##########
Timestep: 248982 Average reward is -0.0685
INFO - Step 249112, loss: 0.60016125440597533
########## Evaluation ##########
Timestep: 249112 Average reward is -0.073
INFO - Step 249200, loss: 0.45721855759620667
INFO - Copied model parameters to target network.
INFO - Step 249260, loss: 0.49954921007156376
########## Evaluation ##########
Timestep: 249260 Average reward is -0.0695
INFO - Step 249409, loss: 0.47513163089752243
#####

  fig, ax = plt.subplots()


INFO - Step 249850, loss: 0.56172180175781255
########## Evaluation ##########
Timestep: 249850 Average reward is -0.0572
INFO - Step 249996, loss: 0.42762249708175665
########## Evaluation ##########
Timestep: 249996 Average reward is -0.0567
INFO - Step 250138, loss: 0.42878556251525884
########## Evaluation ##########
Timestep: 250138 Average reward is -0.0687
INFO - Step 250200, loss: 0.47996681928634644
INFO - Copied model parameters to target network.
INFO - Step 250292, loss: 0.46805569529533386
########## Evaluation ##########
Timestep: 250292 Average reward is -0.0676
INFO - Step 250449, loss: 0.49655431509017944
########## Evaluation ##########
Timestep: 250449 Average reward is -0.048
INFO - Step 250600, loss: 0.47574001550674445
########## Evaluation ##########
Timestep: 250600 Average reward is -0.0416
INFO - Step 250755, loss: 0.69848406314849854
########## Evaluation ##########
Timestep: 250755 Average reward is -0.0661
INFO - Step 250902, loss: 0.45766210556030273
#####

  fig, ax = plt.subplots()



INFO - Copied model parameters to target network.
INFO - Step 251324, loss: 0.43616968393325806
########## Evaluation ##########
Timestep: 251324 Average reward is -0.0611
INFO - Step 251465, loss: 0.52205276489257815
########## Evaluation ##########
Timestep: 251465 Average reward is -0.0767
INFO - Step 251619, loss: 0.38358882069587717
########## Evaluation ##########
Timestep: 251619 Average reward is -0.0522
INFO - Step 251763, loss: 0.50435173511505134
########## Evaluation ##########
Timestep: 251763 Average reward is -0.0636
INFO - Step 251904, loss: 0.49627876281738286
########## Evaluation ##########
Timestep: 251904 Average reward is -0.0512
INFO - Step 252051, loss: 0.61211776733398444
########## Evaluation ##########
Timestep: 252051 Average reward is -0.0696
INFO - Step 252191, loss: 0.51758491992950446
########## Evaluation ##########
Timestep: 252191 Average reward is -0.05
INFO - Step 252200, loss: 0.61449307203292856
INFO - Copied model parameters to target network.
I

  fig, ax = plt.subplots()


INFO - Step 252750, loss: 0.44191038608551025
########## Evaluation ##########
Timestep: 252750 Average reward is -0.0693
INFO - Step 252890, loss: 0.58231151103973393
########## Evaluation ##########
Timestep: 252890 Average reward is -0.0714
INFO - Step 253036, loss: 0.42869502305984497
########## Evaluation ##########
Timestep: 253036 Average reward is -0.0465
INFO - Step 253173, loss: 0.61850953102111825
########## Evaluation ##########
Timestep: 253173 Average reward is -0.0454
INFO - Step 253200, loss: 0.41419300436973576
INFO - Copied model parameters to target network.
INFO - Step 253312, loss: 0.53310805559158336
########## Evaluation ##########
Timestep: 253312 Average reward is -0.0373
INFO - Step 253459, loss: 0.49468550086021423
########## Evaluation ##########
Timestep: 253459 Average reward is -0.0579
INFO - Step 253606, loss: 0.79630672931671143
########## Evaluation ##########
Timestep: 253606 Average reward is -0.0544
INFO - Step 253756, loss: 0.50645899772644047
####

  fig, ax = plt.subplots()


INFO - Step 254197, loss: 0.51634907722473147
########## Evaluation ##########
Timestep: 254197 Average reward is -0.0646
INFO - Step 254200, loss: 0.63514548540115365
INFO - Copied model parameters to target network.
INFO - Step 254346, loss: 0.48422265052795413
########## Evaluation ##########
Timestep: 254346 Average reward is -0.0577
INFO - Step 254487, loss: 0.49472558498382573
########## Evaluation ##########
Timestep: 254487 Average reward is -0.0554
INFO - Step 254627, loss: 0.69482719898223883
########## Evaluation ##########
Timestep: 254627 Average reward is -0.0529
INFO - Step 254769, loss: 0.54008078575134283
########## Evaluation ##########
Timestep: 254769 Average reward is -0.0504
INFO - Step 254912, loss: 0.49400132894515994
########## Evaluation ##########
Timestep: 254912 Average reward is -0.069
INFO - Step 255053, loss: 0.88246995210647583
########## Evaluation ##########
Timestep: 255053 Average reward is -0.0569
INFO - Step 255194, loss: 0.60439175367355354
#####

  fig, ax = plt.subplots()


INFO - Step 255617, loss: 0.47358316183090215
########## Evaluation ##########
Timestep: 255617 Average reward is -0.0476
INFO - Step 255763, loss: 0.52957713603973396
########## Evaluation ##########
Timestep: 255763 Average reward is -0.0684
INFO - Step 255913, loss: 0.43359893560409546
########## Evaluation ##########
Timestep: 255913 Average reward is -0.0478
INFO - Step 256069, loss: 0.53206431865692147
########## Evaluation ##########
Timestep: 256069 Average reward is -0.0685
INFO - Step 256200, loss: 0.58277451992034915
INFO - Copied model parameters to target network.
INFO - Step 256218, loss: 0.50521349906921396
########## Evaluation ##########
Timestep: 256218 Average reward is -0.0643
INFO - Step 256383, loss: 0.35499888658523567
########## Evaluation ##########
Timestep: 256383 Average reward is -0.0653
INFO - Step 256532, loss: 0.42994469404220583
########## Evaluation ##########
Timestep: 256532 Average reward is -0.0625
INFO - Step 256684, loss: 0.64005756378173834
####

  fig, ax = plt.subplots()


INFO - Step 257116, loss: 0.43796718120574954
########## Evaluation ##########
Timestep: 257116 Average reward is -0.0689
INFO - Step 257200, loss: 0.46166598796844485
INFO - Copied model parameters to target network.
INFO - Step 257269, loss: 0.41056299209594727
########## Evaluation ##########
Timestep: 257269 Average reward is -0.0476
INFO - Step 257407, loss: 0.40184608101844797
########## Evaluation ##########
Timestep: 257407 Average reward is -0.0471
INFO - Step 257564, loss: 0.56384271383285526
########## Evaluation ##########
Timestep: 257564 Average reward is -0.0636
INFO - Step 257711, loss: 0.69984340667724616
########## Evaluation ##########
Timestep: 257711 Average reward is -0.0571
INFO - Step 257863, loss: 0.58298552036285436
########## Evaluation ##########
Timestep: 257863 Average reward is -0.056
INFO - Step 258008, loss: 0.75285220146179294
########## Evaluation ##########
Timestep: 258008 Average reward is -0.0729
INFO - Step 258151, loss: 0.45130696892738346
#####

  fig, ax = plt.subplots()


INFO - Step 258607, loss: 0.43704426288604736
########## Evaluation ##########
Timestep: 258607 Average reward is -0.0776
INFO - Step 258746, loss: 0.43591484427452094
########## Evaluation ##########
Timestep: 258746 Average reward is -0.0375
INFO - Step 258901, loss: 0.30026602745056153
########## Evaluation ##########
Timestep: 258901 Average reward is -0.0637
INFO - Step 259044, loss: 0.38962239027023315
########## Evaluation ##########
Timestep: 259044 Average reward is -0.0557
INFO - Step 259187, loss: 0.63445603847503666
########## Evaluation ##########
Timestep: 259187 Average reward is -0.0755
INFO - Step 259200, loss: 0.47748127579689026
INFO - Copied model parameters to target network.
INFO - Step 259345, loss: 0.65705895423889163
########## Evaluation ##########
Timestep: 259345 Average reward is -0.0514
INFO - Step 259481, loss: 0.43739068508148193
########## Evaluation ##########
Timestep: 259481 Average reward is -0.0659
INFO - Step 259642, loss: 0.44982707500457764
####

  fig, ax = plt.subplots()


INFO - Step 260087, loss: 0.42515137791633606
########## Evaluation ##########
Timestep: 260087 Average reward is -0.0465
INFO - Step 260200, loss: 0.82706153392791754
INFO - Copied model parameters to target network.
INFO - Step 260239, loss: 0.67931807041168215
########## Evaluation ##########
Timestep: 260239 Average reward is -0.064
INFO - Step 260384, loss: 0.68193471431732183
########## Evaluation ##########
Timestep: 260384 Average reward is -0.066
INFO - Step 260539, loss: 0.70853769779205326
########## Evaluation ##########
Timestep: 260539 Average reward is -0.0764
INFO - Step 260690, loss: 0.55218350887298584
########## Evaluation ##########
Timestep: 260690 Average reward is -0.0726
INFO - Step 260838, loss: 0.49849399924278266
########## Evaluation ##########
Timestep: 260838 Average reward is -0.0538
INFO - Step 260997, loss: 0.47026872634887695
########## Evaluation ##########
Timestep: 260997 Average reward is -0.0489
INFO - Step 261150, loss: 0.41161197423934937
######

  fig, ax = plt.subplots()


INFO - Step 261602, loss: 0.44596388936042786
########## Evaluation ##########
Timestep: 261602 Average reward is -0.0561
INFO - Step 261750, loss: 0.58932518959045415
########## Evaluation ##########
Timestep: 261750 Average reward is -0.0451
INFO - Step 261885, loss: 0.34586730599403386
########## Evaluation ##########
Timestep: 261885 Average reward is -0.0458
INFO - Step 262033, loss: 0.54672539234161387
########## Evaluation ##########
Timestep: 262033 Average reward is -0.0471
INFO - Step 262191, loss: 0.45292428135871887
########## Evaluation ##########
Timestep: 262191 Average reward is -0.052
INFO - Step 262200, loss: 0.35209190845489566
INFO - Copied model parameters to target network.
INFO - Step 262349, loss: 0.58715814352035524
########## Evaluation ##########
Timestep: 262349 Average reward is -0.0643
INFO - Step 262492, loss: 0.50067603588104254
########## Evaluation ##########
Timestep: 262492 Average reward is -0.0472
INFO - Step 262638, loss: 0.37385529279708866
#####

  fig, ax = plt.subplots()


INFO - Step 263071, loss: 0.43479251861572266
########## Evaluation ##########
Timestep: 263071 Average reward is -0.0696
INFO - Step 263200, loss: 0.53853559494018553
INFO - Copied model parameters to target network.
INFO - Step 263229, loss: 0.49380379915237427
########## Evaluation ##########
Timestep: 263229 Average reward is -0.0666
INFO - Step 263388, loss: 0.57788109779357914
########## Evaluation ##########
Timestep: 263388 Average reward is -0.0564
INFO - Step 263537, loss: 0.49405315518379216
########## Evaluation ##########
Timestep: 263537 Average reward is -0.0487
INFO - Step 263689, loss: 0.78693491220474243
########## Evaluation ##########
Timestep: 263689 Average reward is -0.0476
INFO - Step 263840, loss: 0.44214749336242676
########## Evaluation ##########
Timestep: 263840 Average reward is -0.0438
INFO - Step 263995, loss: 0.48931166529655457
########## Evaluation ##########
Timestep: 263995 Average reward is -0.0619
INFO - Step 264152, loss: 0.40267875790596014
####

  fig, ax = plt.subplots()


INFO - Step 264587, loss: 0.56826037168502813
########## Evaluation ##########
Timestep: 264587 Average reward is -0.0518
INFO - Step 264729, loss: 0.56539344787597664
########## Evaluation ##########
Timestep: 264729 Average reward is -0.0629
INFO - Step 264887, loss: 0.49231812357902527
########## Evaluation ##########
Timestep: 264887 Average reward is -0.0663
INFO - Step 265047, loss: 0.43842253088951114
########## Evaluation ##########
Timestep: 265047 Average reward is -0.0691
INFO - Step 265186, loss: 0.36169326305389404
########## Evaluation ##########
Timestep: 265186 Average reward is -0.0458
INFO - Step 265200, loss: 0.55116796493530274
INFO - Copied model parameters to target network.
INFO - Step 265344, loss: 0.45266598463058474
########## Evaluation ##########
Timestep: 265344 Average reward is -0.066
INFO - Step 265494, loss: 0.41340368986129766
########## Evaluation ##########
Timestep: 265494 Average reward is -0.0665
INFO - Step 265651, loss: 0.63567084074020396
#####

  fig, ax = plt.subplots()


INFO - Step 266095, loss: 0.72810822725296024
########## Evaluation ##########
Timestep: 266095 Average reward is -0.0727
INFO - Step 266200, loss: 0.41505494713783264
INFO - Copied model parameters to target network.
INFO - Step 266239, loss: 0.46314740180969245
########## Evaluation ##########
Timestep: 266239 Average reward is -0.0487
INFO - Step 266383, loss: 0.65456998348236087
########## Evaluation ##########
Timestep: 266383 Average reward is -0.0449
INFO - Step 266536, loss: 0.44557952880859375
########## Evaluation ##########
Timestep: 266536 Average reward is -0.0625
INFO - Step 266695, loss: 0.59012520313262945
########## Evaluation ##########
Timestep: 266695 Average reward is -0.0618
INFO - Step 266856, loss: 0.53772389888763436
########## Evaluation ##########
Timestep: 266856 Average reward is -0.0567
INFO - Step 266998, loss: 0.53958237171173177
########## Evaluation ##########
Timestep: 266998 Average reward is -0.0363
INFO - Step 267142, loss: 0.37347877025604257
####

  fig, ax = plt.subplots()


INFO - Step 267575, loss: 0.60450243949890143
########## Evaluation ##########
Timestep: 267575 Average reward is -0.0685
INFO - Step 267726, loss: 0.60750770568847667
########## Evaluation ##########
Timestep: 267726 Average reward is -0.0648
INFO - Step 267878, loss: 0.39137282967567444
########## Evaluation ##########
Timestep: 267878 Average reward is -0.0706
INFO - Step 268023, loss: 0.51648038625717166
########## Evaluation ##########
Timestep: 268023 Average reward is -0.0715
INFO - Step 268168, loss: 0.51846170425415044
########## Evaluation ##########
Timestep: 268168 Average reward is -0.0515
INFO - Step 268200, loss: 0.71759319305419926
INFO - Copied model parameters to target network.
INFO - Step 268333, loss: 0.51899075508117687
########## Evaluation ##########
Timestep: 268333 Average reward is -0.0499
INFO - Step 268489, loss: 0.40102893114089966
########## Evaluation ##########
Timestep: 268489 Average reward is -0.0464
INFO - Step 268638, loss: 0.59915798902511656
####

  fig, ax = plt.subplots()


INFO - Step 269088, loss: 0.63039261102676397
########## Evaluation ##########
Timestep: 269088 Average reward is -0.0475
INFO - Step 269200, loss: 0.54884356260299685
INFO - Copied model parameters to target network.
INFO - Step 269238, loss: 0.57845270633697513
########## Evaluation ##########
Timestep: 269238 Average reward is -0.0528
INFO - Step 269390, loss: 0.59907841682434083
########## Evaluation ##########
Timestep: 269390 Average reward is -0.0564
INFO - Step 269534, loss: 0.30311793088912964
########## Evaluation ##########
Timestep: 269534 Average reward is -0.0467
INFO - Step 269678, loss: 0.40893119573593143
########## Evaluation ##########
Timestep: 269678 Average reward is -0.0513
INFO - Step 269825, loss: 0.35319650173187256
########## Evaluation ##########
Timestep: 269825 Average reward is -0.0373
INFO - Step 269982, loss: 0.67842215299606327
########## Evaluation ##########
Timestep: 269982 Average reward is -0.0641
INFO - Step 270131, loss: 0.55333650112152174
####