In [1]:
import gym
import numpy as np

from gym.wrappers import AtariPreprocessing
# NOTE(review): this only assigns an attribute on the imported gym module; whether gym
# consults it is not visible here -- confirm. The functions below unpack env.step(...)
# into four values (observation, reward, done, info).
gym.new_step_api=True
# Shared Pong environment instance handed to the training call further down.
env = gym.make('Pong-v0')

# --- network dimensions ---
D = 80 * 80  # length of one flattened 80x80 input grid
H = 200      # number of neurons in the hidden layer

# --- policy network weights, scaled by 1/sqrt(fan-in) ("Xavier" initialization) ---
model = {
  'W1': np.random.randn(H, D) / np.sqrt(D),
  'W2': np.random.randn(H) / np.sqrt(H),
}

# --- hyperparameters ---
batch_size = 10       # number of episodes between parameter updates
learning_rate = 1e-3
gamma = 0.99          # discount factor applied to rewards
decay_rate = 0.99     # decay factor for the RMSProp leaky sum of grad^2

# Zero-filled accumulators, one per weight array and with the same shape/dtype.
grad_buffer = {name: np.zeros_like(weights) for name, weights in model.items()}    # gradients summed over a batch
rmsprop_cache = {name: np.zeros_like(weights) for name, weights in model.items()}  # rmsprop memory

def sigmoid(x):
  """Logistic "squashing" function mapping real inputs into the interval [0, 1].

  Evaluated in a numerically stable way: only exp(-|x|) is ever computed, so
  large-magnitude inputs cannot overflow (the naive 1 / (1 + exp(-x)) overflows
  for very negative x and emits a RuntimeWarning).

  Args:
    x: scalar or array-like of real numbers.

  Returns:
    A NumPy float scalar for scalar input, otherwise an array shaped like x.
  """
  z = np.asarray(x, dtype=float)
  e = np.exp(-np.abs(z))  # always in (0, 1], never overflows
  # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically the same value
  out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
  return out[()]  # unwrap 0-d results to a scalar; leaves real arrays untouched

def prepro(I):
  """Convert a raw frame into a flat binary float vector.

  Rows 35..194 are kept, every second row and column is taken, and only channel 0
  is read. Pixels equal to 0, 144 or 109 (the two background values) become 0.0;
  every other pixel (paddles, ball) becomes 1.0.

  The input is never modified: the mask is computed with comparisons instead of
  in-place assignment on a view of the caller's array.

  Args:
    I: array-like indexable as [row, column, channel] with at least 195 rows.
       # assumes a 210x160x3 Pong frame, giving an 80x80 grid -- TODO confirm

  Returns:
    1-D float array of the cropped, downsampled grid in row-major order.
  """
  frame = np.asarray(I)[35:195:2, ::2, 0]  # crop + downsample by 2, channel 0 (a view, read-only use)
  foreground = (frame != 0) & (frame != 144) & (frame != 109)  # drop both background values
  return foreground.astype(float).ravel()

def discount_rewards(r, discount=None):
  """Compute discounted returns backwards through time.

  The running sum is reset whenever a non-zero reward is met, since in Pong a
  non-zero reward marks a game boundary (pong specific!).

  Args:
    r: array of per-step rewards, 1-D or a column of shape (N, 1). Integer or
       boolean input is converted to float so discounted values are not truncated.
    discount: discount factor; when None the module-level `gamma` is used.

  Returns:
    Float array with the same shape as r holding the discounted return per step.
  """
  rewards = np.asarray(r)
  if not np.issubdtype(rewards.dtype, np.floating):
    rewards = rewards.astype(float)  # zeros_like on ints would floor every fractional return
  factor = gamma if discount is None else discount

  flat = rewards.ravel()
  discounted = np.zeros_like(flat)
  running_add = 0.0
  for t in reversed(range(flat.size)):
    if flat[t] != 0: running_add = 0.0  # game boundary: restart the sum
    running_add = running_add * factor + flat[t]
    discounted[t] = running_add
  return discounted.reshape(rewards.shape)

def policy_forward(x):
  """Forward pass of the two-layer policy stored in the module-level `model`.

  Args:
    x: input vector multiplied by model['W1'].

  Returns:
    (p, h): p is the probability of taking action 2, h is the hidden state after
    the ReLU nonlinearity.
  """
  pre_activation = np.dot(model['W1'], x)
  h = np.where(pre_activation < 0, 0, pre_activation)  # ReLU: negatives clamp to zero
  p = sigmoid(np.dot(model['W2'], h))
  return p, h

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode, using the module-level `model` weights.

  Args:
    epx: stacked inputs of the episode.
    eph: stacked intermediate hidden states of the episode.
    epdlogp: stacked (advantage-modulated) output gradients of the episode.

  Returns:
    dict with gradients for 'W1' and 'W2'.
  """
  w2 = model['W2']
  grad_w2 = np.dot(eph.T, epdlogp).ravel()
  grad_hidden = np.outer(epdlogp, w2)
  inactive = eph <= 0
  grad_hidden[inactive] = 0  # backprop through ReLU: no gradient where the unit was off
  grad_w1 = np.dot(grad_hidden.T, epx)
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """Pick the next action greedily from the current observation.

  Note: `model` is accepted but not consulted here; policy_forward reads the
  module-level `model`.

  Args:
    model: policy parameters (unused inside this function).
    observation: raw frame passed to prepro.
    prev_x: preprocessed previous frame, or None on the first step.

  Returns:
    (action, cur_x): action is 2 when the policy probability is >= 0.5, else 3;
    cur_x is the preprocessed current frame to pass back as prev_x next time.
  """
  cur_x = prepro(observation)

  # network input is the difference frame; all zeros when there is no previous frame
  if prev_x is None:
    x = np.zeros(D)
  else:
    x = cur_x - prev_x

  # forward the policy network and take the more likely action (deterministic, no sampling)
  aprob, _ = policy_forward(x)
  if aprob >= 0.5:
    return 2, cur_x
  return 3, cur_x

def play_game(env, model):
  """Play up to 1000 steps with the greedy policy, then show the recorded frames.

  Exactly one summary line is printed: the "finished after N timesteps" message
  when the environment reports done, or the "finished without success" message
  when the step budget runs out first (previously the latter was printed in both
  cases).

  Args:
    env: environment exposing reset(), render(mode='rgb_array'), step(action) returning
         (observation, reward, done, info), and close().
    model: policy parameters forwarded to model_step.

  Side effects:
    Closes `env` and calls display_frames_as_gif(frames).
    # NOTE(review): display_frames_as_gif is not defined in the visible part of the
    # notebook -- confirm it exists before calling play_game.
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  for t in range(1000):
    frames.append(env.render(mode = 'rgb_array'))
    action, prev_x = model_step(model, observation, prev_x)
    observation, reward, done, info = env.step(action)
    cumulated_reward += reward
    if done:
      print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
      break
  else:
    # only reached when the loop ran out of steps without the episode ending
    print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  env.close()
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy with policy gradients and RMSProp for a number of episodes.

  Differences from the earlier version:
    * returns immediately with an empty history when total_episodes <= 0, and stops
      once episode_number >= total_episodes (the old `==` test could never fire for
      zero, negative or fractional values, so the loop never ended);
    * episode rewards are stacked as floats so the in-place normalisation works even
      when the environment reports integer rewards;
    * the division by the standard deviation is skipped when it is zero, instead of
      filling the gradient (and then the weights) with NaN.

  Note: the forward/backward passes (policy_forward / policy_backward) and the
  accumulators grad_buffer / rmsprop_cache use module-level state; `model` is only
  used here for the parameter update itself.

  Args:
    env: environment exposing reset() and step(action) returning
         (observation, reward, done, info).
    model: dict of parameter arrays, updated in place.
    total_episodes: number of episodes to run before returning.

  Returns:
    list of (episode_number, reward_sum, running_reward) tuples, one per episode.
  """
  hist = []
  if total_episodes <= 0:
    return hist  # nothing to do; avoids an endless loop

  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:

    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs).astype(float) # float so discounting/normalising never truncates
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      reward_std = np.std(discounted_epr)
      if reward_std > 0: # a constant return vector would otherwise give 0/0 = NaN
        discounted_epr /= reward_std

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      print ('episode %f, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None
      if episode_number >= total_episodes:
        return hist

   
    

  logger.warn(
  deprecation(
  deprecation(


In [2]:
# Train for 5500 episodes and time the run; hist1 receives the list of
# (episode_number, reward_sum, running_reward) tuples returned by train_model.
%time hist1 = train_model(env, model, total_episodes=5500)

  logger.deprecation(


episode 1.000000, reward total was -19.000000. running mean: -19.000000
episode 2.000000, reward total was -19.000000. running mean: -19.000000
episode 3.000000, reward total was -20.000000. running mean: -19.010000
episode 4.000000, reward total was -20.000000. running mean: -19.019900
episode 5.000000, reward total was -21.000000. running mean: -19.039701
episode 6.000000, reward total was -18.000000. running mean: -19.029304
episode 7.000000, reward total was -21.000000. running mean: -19.049011
episode 8.000000, reward total was -20.000000. running mean: -19.058521
episode 9.000000, reward total was -18.000000. running mean: -19.047936
episode 10.000000, reward total was -21.000000. running mean: -19.067456
episode 11.000000, reward total was -21.000000. running mean: -19.086782
episode 12.000000, reward total was -21.000000. running mean: -19.105914
episode 13.000000, reward total was -20.000000. running mean: -19.114855
episode 14.000000, reward total was -18.000000. running mean

episode 114.000000, reward total was -20.000000. running mean: -19.946524
episode 115.000000, reward total was -21.000000. running mean: -19.957059
episode 116.000000, reward total was -21.000000. running mean: -19.967488
episode 117.000000, reward total was -20.000000. running mean: -19.967813
episode 118.000000, reward total was -20.000000. running mean: -19.968135
episode 119.000000, reward total was -21.000000. running mean: -19.978454
episode 120.000000, reward total was -20.000000. running mean: -19.978669
episode 121.000000, reward total was -21.000000. running mean: -19.988883
episode 122.000000, reward total was -20.000000. running mean: -19.988994
episode 123.000000, reward total was -17.000000. running mean: -19.959104
episode 124.000000, reward total was -21.000000. running mean: -19.969513
episode 125.000000, reward total was -21.000000. running mean: -19.979818
episode 126.000000, reward total was -19.000000. running mean: -19.970019
episode 127.000000, reward total was -

episode 225.000000, reward total was -20.000000. running mean: -20.128229
episode 226.000000, reward total was -21.000000. running mean: -20.136947
episode 227.000000, reward total was -20.000000. running mean: -20.135577
episode 228.000000, reward total was -21.000000. running mean: -20.144221
episode 229.000000, reward total was -21.000000. running mean: -20.152779
episode 230.000000, reward total was -19.000000. running mean: -20.141251
episode 231.000000, reward total was -21.000000. running mean: -20.149839
episode 232.000000, reward total was -20.000000. running mean: -20.148340
episode 233.000000, reward total was -18.000000. running mean: -20.126857
episode 234.000000, reward total was -19.000000. running mean: -20.115588
episode 235.000000, reward total was -21.000000. running mean: -20.124433
episode 236.000000, reward total was -21.000000. running mean: -20.133188
episode 237.000000, reward total was -20.000000. running mean: -20.131856
episode 238.000000, reward total was -

episode 336.000000, reward total was -20.000000. running mean: -19.927204
episode 337.000000, reward total was -20.000000. running mean: -19.927932
episode 338.000000, reward total was -19.000000. running mean: -19.918653
episode 339.000000, reward total was -19.000000. running mean: -19.909467
episode 340.000000, reward total was -19.000000. running mean: -19.900372
episode 341.000000, reward total was -20.000000. running mean: -19.901368
episode 342.000000, reward total was -21.000000. running mean: -19.912355
episode 343.000000, reward total was -18.000000. running mean: -19.893231
episode 344.000000, reward total was -19.000000. running mean: -19.884299
episode 345.000000, reward total was -20.000000. running mean: -19.885456
episode 346.000000, reward total was -19.000000. running mean: -19.876601
episode 347.000000, reward total was -18.000000. running mean: -19.857835
episode 348.000000, reward total was -20.000000. running mean: -19.859257
episode 349.000000, reward total was -

episode 447.000000, reward total was -21.000000. running mean: -19.807681
episode 448.000000, reward total was -17.000000. running mean: -19.779604
episode 449.000000, reward total was -20.000000. running mean: -19.781808
episode 450.000000, reward total was -21.000000. running mean: -19.793990
episode 451.000000, reward total was -21.000000. running mean: -19.806050
episode 452.000000, reward total was -21.000000. running mean: -19.817989
episode 453.000000, reward total was -21.000000. running mean: -19.829809
episode 454.000000, reward total was -18.000000. running mean: -19.811511
episode 455.000000, reward total was -19.000000. running mean: -19.803396
episode 456.000000, reward total was -20.000000. running mean: -19.805362
episode 457.000000, reward total was -18.000000. running mean: -19.787309
episode 458.000000, reward total was -20.000000. running mean: -19.789435
episode 459.000000, reward total was -21.000000. running mean: -19.801541
episode 460.000000, reward total was -

episode 558.000000, reward total was -20.000000. running mean: -19.460260
episode 559.000000, reward total was -20.000000. running mean: -19.465657
episode 560.000000, reward total was -19.000000. running mean: -19.461001
episode 561.000000, reward total was -21.000000. running mean: -19.476391
episode 562.000000, reward total was -17.000000. running mean: -19.451627
episode 563.000000, reward total was -19.000000. running mean: -19.447111
episode 564.000000, reward total was -18.000000. running mean: -19.432640
episode 565.000000, reward total was -19.000000. running mean: -19.428313
episode 566.000000, reward total was -21.000000. running mean: -19.444030
episode 567.000000, reward total was -19.000000. running mean: -19.439590
episode 568.000000, reward total was -20.000000. running mean: -19.445194
episode 569.000000, reward total was -18.000000. running mean: -19.430742
episode 570.000000, reward total was -17.000000. running mean: -19.406435
episode 571.000000, reward total was -

episode 669.000000, reward total was -20.000000. running mean: -18.943866
episode 670.000000, reward total was -19.000000. running mean: -18.944428
episode 671.000000, reward total was -13.000000. running mean: -18.884983
episode 672.000000, reward total was -20.000000. running mean: -18.896133
episode 673.000000, reward total was -20.000000. running mean: -18.907172
episode 674.000000, reward total was -20.000000. running mean: -18.918100
episode 675.000000, reward total was -21.000000. running mean: -18.938919
episode 676.000000, reward total was -19.000000. running mean: -18.939530
episode 677.000000, reward total was -19.000000. running mean: -18.940135
episode 678.000000, reward total was -19.000000. running mean: -18.940734
episode 679.000000, reward total was -17.000000. running mean: -18.921326
episode 680.000000, reward total was -18.000000. running mean: -18.912113
episode 681.000000, reward total was -21.000000. running mean: -18.932992
episode 682.000000, reward total was -

episode 780.000000, reward total was -20.000000. running mean: -18.442251
episode 781.000000, reward total was -21.000000. running mean: -18.467828
episode 782.000000, reward total was -15.000000. running mean: -18.433150
episode 783.000000, reward total was -17.000000. running mean: -18.418818
episode 784.000000, reward total was -18.000000. running mean: -18.414630
episode 785.000000, reward total was -19.000000. running mean: -18.420484
episode 786.000000, reward total was -19.000000. running mean: -18.426279
episode 787.000000, reward total was -15.000000. running mean: -18.392016
episode 788.000000, reward total was -20.000000. running mean: -18.408096
episode 789.000000, reward total was -19.000000. running mean: -18.414015
episode 790.000000, reward total was -21.000000. running mean: -18.439875
episode 791.000000, reward total was -19.000000. running mean: -18.445476
episode 792.000000, reward total was -19.000000. running mean: -18.451021
episode 793.000000, reward total was -

episode 891.000000, reward total was -14.000000. running mean: -17.563386
episode 892.000000, reward total was -19.000000. running mean: -17.577752
episode 893.000000, reward total was -17.000000. running mean: -17.571975
episode 894.000000, reward total was -17.000000. running mean: -17.566255
episode 895.000000, reward total was -19.000000. running mean: -17.580592
episode 896.000000, reward total was -15.000000. running mean: -17.554786
episode 897.000000, reward total was -20.000000. running mean: -17.579239
episode 898.000000, reward total was -15.000000. running mean: -17.553446
episode 899.000000, reward total was -15.000000. running mean: -17.527912
episode 900.000000, reward total was -19.000000. running mean: -17.542633
episode 901.000000, reward total was -19.000000. running mean: -17.557206
episode 902.000000, reward total was -13.000000. running mean: -17.511634
episode 903.000000, reward total was -17.000000. running mean: -17.506518
episode 904.000000, reward total was -

episode 1002.000000, reward total was -15.000000. running mean: -16.980045
episode 1003.000000, reward total was -11.000000. running mean: -16.920244
episode 1004.000000, reward total was -18.000000. running mean: -16.931042
episode 1005.000000, reward total was -19.000000. running mean: -16.951732
episode 1006.000000, reward total was -19.000000. running mean: -16.972214
episode 1007.000000, reward total was -20.000000. running mean: -17.002492
episode 1008.000000, reward total was -19.000000. running mean: -17.022467
episode 1009.000000, reward total was -20.000000. running mean: -17.052242
episode 1010.000000, reward total was -19.000000. running mean: -17.071720
episode 1011.000000, reward total was -17.000000. running mean: -17.071003
episode 1012.000000, reward total was -15.000000. running mean: -17.050293
episode 1013.000000, reward total was -19.000000. running mean: -17.069790
episode 1014.000000, reward total was -14.000000. running mean: -17.039092
episode 1015.000000, rewa

episode 1112.000000, reward total was -17.000000. running mean: -16.111064
episode 1113.000000, reward total was -15.000000. running mean: -16.099953
episode 1114.000000, reward total was -15.000000. running mean: -16.088953
episode 1115.000000, reward total was -17.000000. running mean: -16.098064
episode 1116.000000, reward total was -9.000000. running mean: -16.027083
episode 1117.000000, reward total was -18.000000. running mean: -16.046812
episode 1118.000000, reward total was -11.000000. running mean: -15.996344
episode 1119.000000, reward total was -18.000000. running mean: -16.016381
episode 1120.000000, reward total was -17.000000. running mean: -16.026217
episode 1121.000000, reward total was -17.000000. running mean: -16.035955
episode 1122.000000, reward total was -16.000000. running mean: -16.035595
episode 1123.000000, reward total was -17.000000. running mean: -16.045239
episode 1124.000000, reward total was -19.000000. running mean: -16.074787
episode 1125.000000, rewar

episode 1222.000000, reward total was -19.000000. running mean: -15.490365
episode 1223.000000, reward total was -9.000000. running mean: -15.425461
episode 1224.000000, reward total was -14.000000. running mean: -15.411207
episode 1225.000000, reward total was -11.000000. running mean: -15.367094
episode 1226.000000, reward total was -16.000000. running mean: -15.373424
episode 1227.000000, reward total was -13.000000. running mean: -15.349689
episode 1228.000000, reward total was -16.000000. running mean: -15.356192
episode 1229.000000, reward total was -14.000000. running mean: -15.342630
episode 1230.000000, reward total was -13.000000. running mean: -15.319204
episode 1231.000000, reward total was -13.000000. running mean: -15.296012
episode 1232.000000, reward total was -15.000000. running mean: -15.293052
episode 1233.000000, reward total was -17.000000. running mean: -15.310122
episode 1234.000000, reward total was -16.000000. running mean: -15.317020
episode 1235.000000, rewar

episode 1332.000000, reward total was -14.000000. running mean: -14.543859
episode 1333.000000, reward total was -17.000000. running mean: -14.568421
episode 1334.000000, reward total was -17.000000. running mean: -14.592737
episode 1335.000000, reward total was -9.000000. running mean: -14.536809
episode 1336.000000, reward total was -17.000000. running mean: -14.561441
episode 1337.000000, reward total was -14.000000. running mean: -14.555827
episode 1338.000000, reward total was -16.000000. running mean: -14.570268
episode 1339.000000, reward total was -16.000000. running mean: -14.584566
episode 1340.000000, reward total was -19.000000. running mean: -14.628720
episode 1341.000000, reward total was -11.000000. running mean: -14.592433
episode 1342.000000, reward total was -14.000000. running mean: -14.586509
episode 1343.000000, reward total was -15.000000. running mean: -14.590643
episode 1344.000000, reward total was -13.000000. running mean: -14.574737
episode 1345.000000, rewar

episode 1442.000000, reward total was -9.000000. running mean: -13.839597
episode 1443.000000, reward total was -19.000000. running mean: -13.891201
episode 1444.000000, reward total was -15.000000. running mean: -13.902289
episode 1445.000000, reward total was -12.000000. running mean: -13.883266
episode 1446.000000, reward total was -13.000000. running mean: -13.874434
episode 1447.000000, reward total was -15.000000. running mean: -13.885689
episode 1448.000000, reward total was -16.000000. running mean: -13.906832
episode 1449.000000, reward total was -15.000000. running mean: -13.917764
episode 1450.000000, reward total was -13.000000. running mean: -13.908586
episode 1451.000000, reward total was -12.000000. running mean: -13.889501
episode 1452.000000, reward total was -6.000000. running mean: -13.810606
episode 1453.000000, reward total was -17.000000. running mean: -13.842500
episode 1454.000000, reward total was -16.000000. running mean: -13.864075
episode 1455.000000, reward

episode 1552.000000, reward total was -11.000000. running mean: -13.284502
episode 1553.000000, reward total was -10.000000. running mean: -13.251657
episode 1554.000000, reward total was -13.000000. running mean: -13.249140
episode 1555.000000, reward total was -11.000000. running mean: -13.226649
episode 1556.000000, reward total was -19.000000. running mean: -13.284382
episode 1557.000000, reward total was -13.000000. running mean: -13.281538
episode 1558.000000, reward total was -12.000000. running mean: -13.268723
episode 1559.000000, reward total was -11.000000. running mean: -13.246036
episode 1560.000000, reward total was -12.000000. running mean: -13.233575
episode 1561.000000, reward total was -17.000000. running mean: -13.271240
episode 1562.000000, reward total was -11.000000. running mean: -13.248527
episode 1563.000000, reward total was -14.000000. running mean: -13.256042
episode 1564.000000, reward total was -17.000000. running mean: -13.293482
episode 1565.000000, rewa

episode 1662.000000, reward total was -14.000000. running mean: -12.720180
episode 1663.000000, reward total was -15.000000. running mean: -12.742978
episode 1664.000000, reward total was -13.000000. running mean: -12.745548
episode 1665.000000, reward total was -13.000000. running mean: -12.748093
episode 1666.000000, reward total was -11.000000. running mean: -12.730612
episode 1667.000000, reward total was -17.000000. running mean: -12.773306
episode 1668.000000, reward total was -13.000000. running mean: -12.775573
episode 1669.000000, reward total was -13.000000. running mean: -12.777817
episode 1670.000000, reward total was -17.000000. running mean: -12.820039
episode 1671.000000, reward total was -14.000000. running mean: -12.831838
episode 1672.000000, reward total was -12.000000. running mean: -12.823520
episode 1673.000000, reward total was -13.000000. running mean: -12.825285
episode 1674.000000, reward total was -11.000000. running mean: -12.807032
episode 1675.000000, rewa

episode 1772.000000, reward total was -11.000000. running mean: -12.257928
episode 1773.000000, reward total was -15.000000. running mean: -12.285349
episode 1774.000000, reward total was -11.000000. running mean: -12.272495
episode 1775.000000, reward total was -13.000000. running mean: -12.279770
episode 1776.000000, reward total was -15.000000. running mean: -12.306972
episode 1777.000000, reward total was -11.000000. running mean: -12.293903
episode 1778.000000, reward total was -11.000000. running mean: -12.280964
episode 1779.000000, reward total was -11.000000. running mean: -12.268154
episode 1780.000000, reward total was -12.000000. running mean: -12.265473
episode 1781.000000, reward total was -10.000000. running mean: -12.242818
episode 1782.000000, reward total was -11.000000. running mean: -12.230390
episode 1783.000000, reward total was -17.000000. running mean: -12.278086
episode 1784.000000, reward total was -15.000000. running mean: -12.305305
episode 1785.000000, rewa

episode 1882.000000, reward total was -7.000000. running mean: -11.775752
episode 1883.000000, reward total was -15.000000. running mean: -11.807995
episode 1884.000000, reward total was -16.000000. running mean: -11.849915
episode 1885.000000, reward total was -15.000000. running mean: -11.881416
episode 1886.000000, reward total was -15.000000. running mean: -11.912602
episode 1887.000000, reward total was -18.000000. running mean: -11.973476
episode 1888.000000, reward total was -13.000000. running mean: -11.983741
episode 1889.000000, reward total was -10.000000. running mean: -11.963903
episode 1890.000000, reward total was -10.000000. running mean: -11.944264
episode 1891.000000, reward total was -16.000000. running mean: -11.984822
episode 1892.000000, reward total was -6.000000. running mean: -11.924974
episode 1893.000000, reward total was -3.000000. running mean: -11.835724
episode 1894.000000, reward total was -15.000000. running mean: -11.867367
episode 1895.000000, reward 

episode 1992.000000, reward total was -14.000000. running mean: -11.689554
episode 1993.000000, reward total was -12.000000. running mean: -11.692658
episode 1994.000000, reward total was -12.000000. running mean: -11.695732
episode 1995.000000, reward total was -11.000000. running mean: -11.688774
episode 1996.000000, reward total was -14.000000. running mean: -11.711886
episode 1997.000000, reward total was -11.000000. running mean: -11.704768
episode 1998.000000, reward total was -13.000000. running mean: -11.717720
episode 1999.000000, reward total was -10.000000. running mean: -11.700543
episode 2000.000000, reward total was -13.000000. running mean: -11.713537
episode 2001.000000, reward total was -15.000000. running mean: -11.746402
episode 2002.000000, reward total was -19.000000. running mean: -11.818938
episode 2003.000000, reward total was -10.000000. running mean: -11.800749
episode 2004.000000, reward total was -13.000000. running mean: -11.812741
episode 2005.000000, rewa

episode 2102.000000, reward total was -15.000000. running mean: -11.810890
episode 2103.000000, reward total was -8.000000. running mean: -11.772781
episode 2104.000000, reward total was -7.000000. running mean: -11.725053
episode 2105.000000, reward total was -14.000000. running mean: -11.747803
episode 2106.000000, reward total was -10.000000. running mean: -11.730324
episode 2107.000000, reward total was -8.000000. running mean: -11.693021
episode 2108.000000, reward total was -10.000000. running mean: -11.676091
episode 2109.000000, reward total was -8.000000. running mean: -11.639330
episode 2110.000000, reward total was -11.000000. running mean: -11.632937
episode 2111.000000, reward total was -6.000000. running mean: -11.576607
episode 2112.000000, reward total was -15.000000. running mean: -11.610841
episode 2113.000000, reward total was -18.000000. running mean: -11.674733
episode 2114.000000, reward total was -17.000000. running mean: -11.727986
episode 2115.000000, reward to

episode 2212.000000, reward total was -15.000000. running mean: -11.299034
episode 2213.000000, reward total was -11.000000. running mean: -11.296044
episode 2214.000000, reward total was -13.000000. running mean: -11.313083
episode 2215.000000, reward total was -12.000000. running mean: -11.319952
episode 2216.000000, reward total was -9.000000. running mean: -11.296753
episode 2217.000000, reward total was -13.000000. running mean: -11.313785
episode 2218.000000, reward total was -15.000000. running mean: -11.350647
episode 2219.000000, reward total was -9.000000. running mean: -11.327141
episode 2220.000000, reward total was -18.000000. running mean: -11.393870
episode 2221.000000, reward total was -10.000000. running mean: -11.379931
episode 2222.000000, reward total was -17.000000. running mean: -11.436132
episode 2223.000000, reward total was -11.000000. running mean: -11.431770
episode 2224.000000, reward total was -17.000000. running mean: -11.487452
episode 2225.000000, reward

episode 2322.000000, reward total was -8.000000. running mean: -10.226632
episode 2323.000000, reward total was -15.000000. running mean: -10.274365
episode 2324.000000, reward total was -13.000000. running mean: -10.301622
episode 2325.000000, reward total was -12.000000. running mean: -10.318605
episode 2326.000000, reward total was -12.000000. running mean: -10.335419
episode 2327.000000, reward total was -12.000000. running mean: -10.352065
episode 2328.000000, reward total was -15.000000. running mean: -10.398544
episode 2329.000000, reward total was -4.000000. running mean: -10.334559
episode 2330.000000, reward total was -13.000000. running mean: -10.361213
episode 2331.000000, reward total was -6.000000. running mean: -10.317601
episode 2332.000000, reward total was -10.000000. running mean: -10.314425
episode 2333.000000, reward total was -5.000000. running mean: -10.261281
episode 2334.000000, reward total was -7.000000. running mean: -10.228668
episode 2335.000000, reward to

episode 2433.000000, reward total was -11.000000. running mean: -10.394698
episode 2434.000000, reward total was -7.000000. running mean: -10.360751
episode 2435.000000, reward total was -12.000000. running mean: -10.377144
episode 2436.000000, reward total was -13.000000. running mean: -10.403372
episode 2437.000000, reward total was -12.000000. running mean: -10.419338
episode 2438.000000, reward total was -15.000000. running mean: -10.465145
episode 2439.000000, reward total was -6.000000. running mean: -10.420494
episode 2440.000000, reward total was -11.000000. running mean: -10.426289
episode 2441.000000, reward total was -16.000000. running mean: -10.482026
episode 2442.000000, reward total was -8.000000. running mean: -10.457206
episode 2443.000000, reward total was -13.000000. running mean: -10.482634
episode 2444.000000, reward total was -11.000000. running mean: -10.487807
episode 2445.000000, reward total was -13.000000. running mean: -10.512929
episode 2446.000000, reward 

episode 2544.000000, reward total was -8.000000. running mean: -9.716122
episode 2545.000000, reward total was -14.000000. running mean: -9.758960
episode 2546.000000, reward total was -18.000000. running mean: -9.841371
episode 2547.000000, reward total was -10.000000. running mean: -9.842957
episode 2548.000000, reward total was -11.000000. running mean: -9.854527
episode 2549.000000, reward total was -13.000000. running mean: -9.885982
episode 2550.000000, reward total was -12.000000. running mean: -9.907122
episode 2551.000000, reward total was -15.000000. running mean: -9.958051
episode 2552.000000, reward total was -15.000000. running mean: -10.008471
episode 2553.000000, reward total was -5.000000. running mean: -9.958386
episode 2554.000000, reward total was -6.000000. running mean: -9.918802
episode 2555.000000, reward total was -11.000000. running mean: -9.929614
episode 2556.000000, reward total was -17.000000. running mean: -10.000318
episode 2557.000000, reward total was -

episode 2656.000000, reward total was -10.000000. running mean: -9.923593
episode 2657.000000, reward total was -9.000000. running mean: -9.914357
episode 2658.000000, reward total was -14.000000. running mean: -9.955214
episode 2659.000000, reward total was -9.000000. running mean: -9.945662
episode 2660.000000, reward total was -12.000000. running mean: -9.966205
episode 2661.000000, reward total was -4.000000. running mean: -9.906543
episode 2662.000000, reward total was -16.000000. running mean: -9.967478
episode 2663.000000, reward total was -7.000000. running mean: -9.937803
episode 2664.000000, reward total was -11.000000. running mean: -9.948425
episode 2665.000000, reward total was -11.000000. running mean: -9.958941
episode 2666.000000, reward total was -8.000000. running mean: -9.939351
episode 2667.000000, reward total was -16.000000. running mean: -9.999958
episode 2668.000000, reward total was -6.000000. running mean: -9.959958
episode 2669.000000, reward total was -6.000

episode 2768.000000, reward total was -13.000000. running mean: -9.305555
episode 2769.000000, reward total was -15.000000. running mean: -9.362500
episode 2770.000000, reward total was -17.000000. running mean: -9.438875
episode 2771.000000, reward total was 2.000000. running mean: -9.324486
episode 2772.000000, reward total was -9.000000. running mean: -9.321241
episode 2773.000000, reward total was -14.000000. running mean: -9.368029
episode 2774.000000, reward total was -10.000000. running mean: -9.374348
episode 2775.000000, reward total was -13.000000. running mean: -9.410605
episode 2776.000000, reward total was -9.000000. running mean: -9.406499
episode 2777.000000, reward total was -1.000000. running mean: -9.322434
episode 2778.000000, reward total was -11.000000. running mean: -9.339209
episode 2779.000000, reward total was -3.000000. running mean: -9.275817
episode 2780.000000, reward total was -10.000000. running mean: -9.283059
episode 2781.000000, reward total was -13.00

episode 2880.000000, reward total was -4.000000. running mean: -9.552570
episode 2881.000000, reward total was -13.000000. running mean: -9.587044
episode 2882.000000, reward total was -11.000000. running mean: -9.601174
episode 2883.000000, reward total was -7.000000. running mean: -9.575162
episode 2884.000000, reward total was -7.000000. running mean: -9.549410
episode 2885.000000, reward total was -14.000000. running mean: -9.593916
episode 2886.000000, reward total was 2.000000. running mean: -9.477977
episode 2887.000000, reward total was -8.000000. running mean: -9.463197
episode 2888.000000, reward total was -8.000000. running mean: -9.448565
episode 2889.000000, reward total was -6.000000. running mean: -9.414080
episode 2890.000000, reward total was -11.000000. running mean: -9.429939
episode 2891.000000, reward total was 3.000000. running mean: -9.305639
episode 2892.000000, reward total was -15.000000. running mean: -9.362583
episode 2893.000000, reward total was -15.000000

episode 2992.000000, reward total was -9.000000. running mean: -8.889273
episode 2993.000000, reward total was -6.000000. running mean: -8.860381
episode 2994.000000, reward total was -11.000000. running mean: -8.881777
episode 2995.000000, reward total was -8.000000. running mean: -8.872959
episode 2996.000000, reward total was -10.000000. running mean: -8.884229
episode 2997.000000, reward total was -3.000000. running mean: -8.825387
episode 2998.000000, reward total was -1.000000. running mean: -8.747133
episode 2999.000000, reward total was -12.000000. running mean: -8.779662
episode 3000.000000, reward total was -9.000000. running mean: -8.781865
episode 3001.000000, reward total was -4.000000. running mean: -8.734047
episode 3002.000000, reward total was -11.000000. running mean: -8.756706
episode 3003.000000, reward total was -16.000000. running mean: -8.829139
episode 3004.000000, reward total was -15.000000. running mean: -8.890848
episode 3005.000000, reward total was -11.000

episode 3104.000000, reward total was -6.000000. running mean: -8.701533
episode 3105.000000, reward total was -17.000000. running mean: -8.784518
episode 3106.000000, reward total was -13.000000. running mean: -8.826673
episode 3107.000000, reward total was -9.000000. running mean: -8.828406
episode 3108.000000, reward total was -9.000000. running mean: -8.830122
episode 3109.000000, reward total was -9.000000. running mean: -8.831821
episode 3110.000000, reward total was -3.000000. running mean: -8.773502
episode 3111.000000, reward total was -2.000000. running mean: -8.705767
episode 3112.000000, reward total was -8.000000. running mean: -8.698710
episode 3113.000000, reward total was -13.000000. running mean: -8.741723
episode 3114.000000, reward total was -10.000000. running mean: -8.754305
episode 3115.000000, reward total was -6.000000. running mean: -8.726762
episode 3116.000000, reward total was -5.000000. running mean: -8.689495
episode 3117.000000, reward total was 9.000000.

episode 3216.000000, reward total was -11.000000. running mean: -8.292206
episode 3217.000000, reward total was -11.000000. running mean: -8.319284
episode 3218.000000, reward total was -9.000000. running mean: -8.326091
episode 3219.000000, reward total was -7.000000. running mean: -8.312830
episode 3220.000000, reward total was -3.000000. running mean: -8.259702
episode 3221.000000, reward total was -7.000000. running mean: -8.247105
episode 3222.000000, reward total was -8.000000. running mean: -8.244634
episode 3223.000000, reward total was -6.000000. running mean: -8.222187
episode 3224.000000, reward total was -1.000000. running mean: -8.149965
episode 3225.000000, reward total was -12.000000. running mean: -8.188466
episode 3226.000000, reward total was -2.000000. running mean: -8.126581
episode 3227.000000, reward total was -15.000000. running mean: -8.195315
episode 3228.000000, reward total was -8.000000. running mean: -8.193362
episode 3229.000000, reward total was -12.00000

episode 3328.000000, reward total was -2.000000. running mean: -7.638223
episode 3329.000000, reward total was -2.000000. running mean: -7.581841
episode 3330.000000, reward total was -11.000000. running mean: -7.616023
episode 3331.000000, reward total was -8.000000. running mean: -7.619862
episode 3332.000000, reward total was -14.000000. running mean: -7.683664
episode 3333.000000, reward total was 2.000000. running mean: -7.586827
episode 3334.000000, reward total was -4.000000. running mean: -7.550959
episode 3335.000000, reward total was -11.000000. running mean: -7.585449
episode 3336.000000, reward total was -12.000000. running mean: -7.629595
episode 3337.000000, reward total was -10.000000. running mean: -7.653299
episode 3338.000000, reward total was -15.000000. running mean: -7.726766
episode 3339.000000, reward total was 2.000000. running mean: -7.629498
episode 3340.000000, reward total was 6.000000. running mean: -7.493203
episode 3341.000000, reward total was -9.000000.

episode 3440.000000, reward total was -11.000000. running mean: -7.066341
episode 3441.000000, reward total was -2.000000. running mean: -7.015678
episode 3442.000000, reward total was -10.000000. running mean: -7.045521
episode 3443.000000, reward total was -9.000000. running mean: -7.065066
episode 3444.000000, reward total was -11.000000. running mean: -7.104415
episode 3445.000000, reward total was -11.000000. running mean: -7.143371
episode 3446.000000, reward total was 1.000000. running mean: -7.061937
episode 3447.000000, reward total was -11.000000. running mean: -7.101318
episode 3448.000000, reward total was -7.000000. running mean: -7.100305
episode 3449.000000, reward total was -11.000000. running mean: -7.139302
episode 3450.000000, reward total was -12.000000. running mean: -7.187909
episode 3451.000000, reward total was -10.000000. running mean: -7.216030
episode 3452.000000, reward total was -4.000000. running mean: -7.183869
episode 3453.000000, reward total was -5.000

episode 3552.000000, reward total was -7.000000. running mean: -7.112506
episode 3553.000000, reward total was -13.000000. running mean: -7.171381
episode 3554.000000, reward total was -2.000000. running mean: -7.119668
episode 3555.000000, reward total was -13.000000. running mean: -7.178471
episode 3556.000000, reward total was -9.000000. running mean: -7.196686
episode 3557.000000, reward total was -6.000000. running mean: -7.184719
episode 3558.000000, reward total was -6.000000. running mean: -7.172872
episode 3559.000000, reward total was 3.000000. running mean: -7.071143
episode 3560.000000, reward total was -14.000000. running mean: -7.140432
episode 3561.000000, reward total was -8.000000. running mean: -7.149028
episode 3562.000000, reward total was -10.000000. running mean: -7.177537
episode 3563.000000, reward total was -13.000000. running mean: -7.235762
episode 3564.000000, reward total was -6.000000. running mean: -7.223404
episode 3565.000000, reward total was -2.000000

episode 3664.000000, reward total was -5.000000. running mean: -6.769236
episode 3665.000000, reward total was -11.000000. running mean: -6.811544
episode 3666.000000, reward total was -5.000000. running mean: -6.793429
episode 3667.000000, reward total was -4.000000. running mean: -6.765494
episode 3668.000000, reward total was -12.000000. running mean: -6.817839
episode 3669.000000, reward total was -17.000000. running mean: -6.919661
episode 3670.000000, reward total was -4.000000. running mean: -6.890464
episode 3671.000000, reward total was -8.000000. running mean: -6.901560
episode 3672.000000, reward total was -8.000000. running mean: -6.912544
episode 3673.000000, reward total was -6.000000. running mean: -6.903419
episode 3674.000000, reward total was -4.000000. running mean: -6.874384
episode 3675.000000, reward total was -7.000000. running mean: -6.875641
episode 3676.000000, reward total was -10.000000. running mean: -6.906884
episode 3677.000000, reward total was 1.000000.

episode 3776.000000, reward total was -8.000000. running mean: -7.361994
episode 3777.000000, reward total was -6.000000. running mean: -7.348374
episode 3778.000000, reward total was 7.000000. running mean: -7.204891
episode 3779.000000, reward total was 3.000000. running mean: -7.102842
episode 3780.000000, reward total was -5.000000. running mean: -7.081813
episode 3781.000000, reward total was -7.000000. running mean: -7.080995
episode 3782.000000, reward total was -7.000000. running mean: -7.080185
episode 3783.000000, reward total was -9.000000. running mean: -7.099383
episode 3784.000000, reward total was -14.000000. running mean: -7.168389
episode 3785.000000, reward total was -4.000000. running mean: -7.136706
episode 3786.000000, reward total was -10.000000. running mean: -7.165339
episode 3787.000000, reward total was -6.000000. running mean: -7.153685
episode 3788.000000, reward total was 1.000000. running mean: -7.072148
episode 3789.000000, reward total was -11.000000. ru

episode 3889.000000, reward total was 5.000000. running mean: -6.433687
episode 3890.000000, reward total was -11.000000. running mean: -6.479350
episode 3891.000000, reward total was -11.000000. running mean: -6.524556
episode 3892.000000, reward total was -9.000000. running mean: -6.549311
episode 3893.000000, reward total was 6.000000. running mean: -6.423817
episode 3894.000000, reward total was -3.000000. running mean: -6.389579
episode 3895.000000, reward total was -9.000000. running mean: -6.415684
episode 3896.000000, reward total was -11.000000. running mean: -6.461527
episode 3897.000000, reward total was -2.000000. running mean: -6.416911
episode 3898.000000, reward total was -2.000000. running mean: -6.372742
episode 3899.000000, reward total was -5.000000. running mean: -6.359015
episode 3900.000000, reward total was -9.000000. running mean: -6.385425
episode 3901.000000, reward total was -7.000000. running mean: -6.391570
episode 3902.000000, reward total was -6.000000. r

episode 4002.000000, reward total was -5.000000. running mean: -5.893689
episode 4003.000000, reward total was -4.000000. running mean: -5.874752
episode 4004.000000, reward total was -8.000000. running mean: -5.896004
episode 4005.000000, reward total was -15.000000. running mean: -5.987044
episode 4006.000000, reward total was -12.000000. running mean: -6.047174
episode 4007.000000, reward total was -8.000000. running mean: -6.066702
episode 4008.000000, reward total was -10.000000. running mean: -6.106035
episode 4009.000000, reward total was -5.000000. running mean: -6.094975
episode 4010.000000, reward total was -11.000000. running mean: -6.144025
episode 4011.000000, reward total was -15.000000. running mean: -6.232585
episode 4012.000000, reward total was -9.000000. running mean: -6.260259
episode 4013.000000, reward total was 6.000000. running mean: -6.137656
episode 4014.000000, reward total was -11.000000. running mean: -6.186280
episode 4015.000000, reward total was -4.00000

episode 4115.000000, reward total was -4.000000. running mean: -5.450505
episode 4116.000000, reward total was -5.000000. running mean: -5.446000
episode 4117.000000, reward total was -3.000000. running mean: -5.421540
episode 4118.000000, reward total was -13.000000. running mean: -5.497324
episode 4119.000000, reward total was -13.000000. running mean: -5.572351
episode 4120.000000, reward total was -13.000000. running mean: -5.646627
episode 4121.000000, reward total was -10.000000. running mean: -5.690161
episode 4122.000000, reward total was -12.000000. running mean: -5.753260
episode 4123.000000, reward total was -12.000000. running mean: -5.815727
episode 4124.000000, reward total was -3.000000. running mean: -5.787570
episode 4125.000000, reward total was -11.000000. running mean: -5.839694
episode 4126.000000, reward total was 1.000000. running mean: -5.771297
episode 4127.000000, reward total was -1.000000. running mean: -5.723584
episode 4128.000000, reward total was -10.000

episode 4228.000000, reward total was -7.000000. running mean: -5.161538
episode 4229.000000, reward total was -13.000000. running mean: -5.239923
episode 4230.000000, reward total was -5.000000. running mean: -5.237523
episode 4231.000000, reward total was -2.000000. running mean: -5.205148
episode 4232.000000, reward total was -6.000000. running mean: -5.213097
episode 4233.000000, reward total was -10.000000. running mean: -5.260966
episode 4234.000000, reward total was -12.000000. running mean: -5.328356
episode 4235.000000, reward total was -1.000000. running mean: -5.285073
episode 4236.000000, reward total was -13.000000. running mean: -5.362222
episode 4237.000000, reward total was -3.000000. running mean: -5.338600
episode 4238.000000, reward total was 6.000000. running mean: -5.225214
episode 4239.000000, reward total was -6.000000. running mean: -5.232961
episode 4240.000000, reward total was -10.000000. running mean: -5.280632
episode 4241.000000, reward total was -5.000000

episode 4341.000000, reward total was -6.000000. running mean: -5.243355
episode 4342.000000, reward total was -9.000000. running mean: -5.280922
episode 4343.000000, reward total was 3.000000. running mean: -5.198112
episode 4344.000000, reward total was -3.000000. running mean: -5.176131
episode 4345.000000, reward total was -2.000000. running mean: -5.144370
episode 4346.000000, reward total was -1.000000. running mean: -5.102926
episode 4347.000000, reward total was -14.000000. running mean: -5.191897
episode 4348.000000, reward total was -3.000000. running mean: -5.169978
episode 4349.000000, reward total was -10.000000. running mean: -5.218278
episode 4350.000000, reward total was -16.000000. running mean: -5.326095
episode 4351.000000, reward total was -11.000000. running mean: -5.382834
episode 4352.000000, reward total was -3.000000. running mean: -5.359006
episode 4353.000000, reward total was -12.000000. running mean: -5.425416
episode 4354.000000, reward total was -12.00000

episode 4454.000000, reward total was -2.000000. running mean: -5.578706
episode 4455.000000, reward total was 1.000000. running mean: -5.512919
episode 4456.000000, reward total was 1.000000. running mean: -5.447790
episode 4457.000000, reward total was -4.000000. running mean: -5.433312
episode 4458.000000, reward total was -4.000000. running mean: -5.418979
episode 4459.000000, reward total was -11.000000. running mean: -5.474789
episode 4460.000000, reward total was -9.000000. running mean: -5.510041
episode 4461.000000, reward total was 5.000000. running mean: -5.404941
episode 4462.000000, reward total was -1.000000. running mean: -5.360891
episode 4463.000000, reward total was -3.000000. running mean: -5.337282
episode 4464.000000, reward total was -13.000000. running mean: -5.413910
episode 4465.000000, reward total was -10.000000. running mean: -5.459770
episode 4466.000000, reward total was -7.000000. running mean: -5.475173
episode 4467.000000, reward total was -3.000000. ru

episode 4567.000000, reward total was -1.000000. running mean: -3.723757
episode 4568.000000, reward total was -9.000000. running mean: -3.776519
episode 4569.000000, reward total was -3.000000. running mean: -3.768754
episode 4570.000000, reward total was -13.000000. running mean: -3.861067
episode 4571.000000, reward total was 1.000000. running mean: -3.812456
episode 4572.000000, reward total was -5.000000. running mean: -3.824331
episode 4573.000000, reward total was 7.000000. running mean: -3.716088
episode 4574.000000, reward total was 3.000000. running mean: -3.648927
episode 4575.000000, reward total was -3.000000. running mean: -3.642438
episode 4576.000000, reward total was 6.000000. running mean: -3.546014
episode 4577.000000, reward total was -1.000000. running mean: -3.520553
episode 4578.000000, reward total was 6.000000. running mean: -3.425348
episode 4579.000000, reward total was -1.000000. running mean: -3.401094
episode 4580.000000, reward total was -2.000000. runnin

episode 4680.000000, reward total was -10.000000. running mean: -3.735329
episode 4681.000000, reward total was -6.000000. running mean: -3.757976
episode 4682.000000, reward total was -15.000000. running mean: -3.870396
episode 4683.000000, reward total was -3.000000. running mean: -3.861692
episode 4684.000000, reward total was -14.000000. running mean: -3.963075
episode 4685.000000, reward total was -10.000000. running mean: -4.023444
episode 4686.000000, reward total was -5.000000. running mean: -4.033210
episode 4687.000000, reward total was -15.000000. running mean: -4.142878
episode 4688.000000, reward total was 6.000000. running mean: -4.041449
episode 4689.000000, reward total was -1.000000. running mean: -4.011035
episode 4690.000000, reward total was -2.000000. running mean: -3.990924
episode 4691.000000, reward total was -10.000000. running mean: -4.051015
episode 4692.000000, reward total was -3.000000. running mean: -4.040505
episode 4693.000000, reward total was -1.00000

episode 4793.000000, reward total was -13.000000. running mean: -4.543840
episode 4794.000000, reward total was -4.000000. running mean: -4.538402
episode 4795.000000, reward total was -17.000000. running mean: -4.663018
episode 4796.000000, reward total was -13.000000. running mean: -4.746388
episode 4797.000000, reward total was -7.000000. running mean: -4.768924
episode 4798.000000, reward total was -7.000000. running mean: -4.791235
episode 4799.000000, reward total was -1.000000. running mean: -4.753322
episode 4800.000000, reward total was -9.000000. running mean: -4.795789
episode 4801.000000, reward total was -6.000000. running mean: -4.807831
episode 4802.000000, reward total was -12.000000. running mean: -4.879753
episode 4803.000000, reward total was -10.000000. running mean: -4.930955
episode 4804.000000, reward total was -4.000000. running mean: -4.921646
episode 4805.000000, reward total was -1.000000. running mean: -4.882429
episode 4806.000000, reward total was -7.00000

episode 4906.000000, reward total was 7.000000. running mean: -2.995981
episode 4907.000000, reward total was 5.000000. running mean: -2.916021
episode 4908.000000, reward total was -5.000000. running mean: -2.936861
episode 4909.000000, reward total was -12.000000. running mean: -3.027492
episode 4910.000000, reward total was -1.000000. running mean: -3.007217
episode 4911.000000, reward total was 10.000000. running mean: -2.877145
episode 4912.000000, reward total was -2.000000. running mean: -2.868373
episode 4913.000000, reward total was -7.000000. running mean: -2.909690
episode 4914.000000, reward total was 3.000000. running mean: -2.850593
episode 4915.000000, reward total was -12.000000. running mean: -2.942087
episode 4916.000000, reward total was 1.000000. running mean: -2.902666
episode 4917.000000, reward total was -6.000000. running mean: -2.933639
episode 4918.000000, reward total was -8.000000. running mean: -2.984303
episode 4919.000000, reward total was -6.000000. runn

episode 5019.000000, reward total was -3.000000. running mean: -3.368094
episode 5020.000000, reward total was -3.000000. running mean: -3.364413
episode 5021.000000, reward total was -6.000000. running mean: -3.390769
episode 5022.000000, reward total was 6.000000. running mean: -3.296861
episode 5023.000000, reward total was 4.000000. running mean: -3.223893
episode 5024.000000, reward total was 7.000000. running mean: -3.121654
episode 5025.000000, reward total was 9.000000. running mean: -3.000437
episode 5026.000000, reward total was 3.000000. running mean: -2.940433
episode 5027.000000, reward total was 8.000000. running mean: -2.831028
episode 5028.000000, reward total was 9.000000. running mean: -2.712718
episode 5029.000000, reward total was -13.000000. running mean: -2.815591
episode 5030.000000, reward total was 1.000000. running mean: -2.777435
episode 5031.000000, reward total was -5.000000. running mean: -2.799661
episode 5032.000000, reward total was 10.000000. running m

episode 5132.000000, reward total was 4.000000. running mean: -3.062660
episode 5133.000000, reward total was 3.000000. running mean: -3.002033
episode 5134.000000, reward total was -4.000000. running mean: -3.012013
episode 5135.000000, reward total was -6.000000. running mean: -3.041893
episode 5136.000000, reward total was -4.000000. running mean: -3.051474
episode 5137.000000, reward total was -9.000000. running mean: -3.110959
episode 5138.000000, reward total was -5.000000. running mean: -3.129850
episode 5139.000000, reward total was -4.000000. running mean: -3.138551
episode 5140.000000, reward total was 2.000000. running mean: -3.087166
episode 5141.000000, reward total was 3.000000. running mean: -3.026294
episode 5142.000000, reward total was -12.000000. running mean: -3.116031
episode 5143.000000, reward total was -8.000000. running mean: -3.164871
episode 5144.000000, reward total was 5.000000. running mean: -3.083222
episode 5145.000000, reward total was -3.000000. runnin

episode 5245.000000, reward total was -5.000000. running mean: -2.887130
episode 5246.000000, reward total was -1.000000. running mean: -2.868259
episode 5247.000000, reward total was -1.000000. running mean: -2.849576
episode 5248.000000, reward total was 9.000000. running mean: -2.731080
episode 5249.000000, reward total was 5.000000. running mean: -2.653770
episode 5250.000000, reward total was -9.000000. running mean: -2.717232
episode 5251.000000, reward total was 3.000000. running mean: -2.660060
episode 5252.000000, reward total was 5.000000. running mean: -2.583459
episode 5253.000000, reward total was 3.000000. running mean: -2.527624
episode 5254.000000, reward total was 8.000000. running mean: -2.422348
episode 5255.000000, reward total was 9.000000. running mean: -2.308125
episode 5256.000000, reward total was 5.000000. running mean: -2.235043
episode 5257.000000, reward total was 6.000000. running mean: -2.152693
episode 5258.000000, reward total was 5.000000. running mean

episode 5358.000000, reward total was -1.000000. running mean: -2.132480
episode 5359.000000, reward total was 2.000000. running mean: -2.091156
episode 5360.000000, reward total was -1.000000. running mean: -2.080244
episode 5361.000000, reward total was -7.000000. running mean: -2.129442
episode 5362.000000, reward total was -4.000000. running mean: -2.148147
episode 5363.000000, reward total was 8.000000. running mean: -2.046666
episode 5364.000000, reward total was -1.000000. running mean: -2.036199
episode 5365.000000, reward total was -1.000000. running mean: -2.025837
episode 5366.000000, reward total was -11.000000. running mean: -2.115579
episode 5367.000000, reward total was -8.000000. running mean: -2.174423
episode 5368.000000, reward total was -5.000000. running mean: -2.202679
episode 5369.000000, reward total was 3.000000. running mean: -2.150652
episode 5370.000000, reward total was 3.000000. running mean: -2.099145
episode 5371.000000, reward total was -1.000000. runni

episode 5471.000000, reward total was -9.000000. running mean: -2.482775
episode 5472.000000, reward total was -10.000000. running mean: -2.557947
episode 5473.000000, reward total was -9.000000. running mean: -2.622368
episode 5474.000000, reward total was 5.000000. running mean: -2.546144
episode 5475.000000, reward total was -2.000000. running mean: -2.540683
episode 5476.000000, reward total was 5.000000. running mean: -2.465276
episode 5477.000000, reward total was -13.000000. running mean: -2.570623
episode 5478.000000, reward total was -8.000000. running mean: -2.624917
episode 5479.000000, reward total was 6.000000. running mean: -2.538668
episode 5480.000000, reward total was -5.000000. running mean: -2.563281
episode 5481.000000, reward total was -13.000000. running mean: -2.667648
episode 5482.000000, reward total was -1.000000. running mean: -2.650972
episode 5483.000000, reward total was -5.000000. running mean: -2.674462
episode 5484.000000, reward total was -9.000000. ru