# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay, as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import os
import random
import sys
from collections import deque
from copy import deepcopy
from datetime import datetime

import gym
import numpy as np
import pylab
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from utils import *
from agent import *
from model import *
from config import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ so you can see what the environment looks like. For further documentation of the environment, refer to https://gym.openai.com/envs.

In [2]:
# Create the Breakout environment that every later cell shares through `env`.
env = gym.make('BreakoutDeterministic-v4')
# Kept as the last expression so that the cell shows the value returned by render().
env.render()

True

In [3]:
# Number of actions handed to the agent when it is constructed.
action_size = 3

# NOTE(review): presumably the number of lives the player starts an episode
# with -- confirm against the imported find_max_lifes helper.
number_lives = find_max_lifes(env)

# Shape of a single raw observation as reported by the environment.
state_size = env.observation_space.shape

# Learning-curve data: the training loop appends the running average reward and
# the episode index at which it was sampled.
rewards = []
episodes = []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
# Global counters. `frame` is advanced by the training loop on every
# environment step; `memory_size` is only initialised here.
frame = memory_size = 0

# Sliding window over the scores of the most recent episodes; its mean is the
# "evaluation reward" reported during training.
evaluation_reward = deque(maxlen=evaluation_reward_length)

# The learner itself, built for `action_size` discrete actions.
agent = Agent(action_size)

### Main Training Loop

In [None]:
# Create the output folders up front so that savefig / torch.save cannot fail
# on a missing directory after hours of training.
os.makedirs("./save_graph", exist_ok=True)
os.makedirs("./save_model", exist_ok=True)

# Set once the running average score crosses the stopping threshold; used to
# leave the episode loop cleanly instead of raising SystemExit in the kernel.
solved = False

for e in range(EPISODES):
    done = False
    score = 0

    # Rolling window of five 84x84 uint8 frames: slots 0-3 form the state fed
    # to the agent, slot 4 receives the newest frame before the window shifts.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    life = number_lives

    # NOTE(review): presumably seeds `history` from the first observation --
    # confirm against the imported get_init_state helper.
    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select an action from the current 4-frame state, scaled to [0, 1].
        action = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        # The environment is stepped with action + 1, so the agent's action
        # index is shifted by one before it reaches the emulator.
        next_state, reward, done, info = env.step(action + 1)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        # NOTE(review): presumably True when a life was lost on this step --
        # confirm against the imported check_live helper.
        terminal_state = check_live(life, info['ale.lives'])

        life = info['ale.lives']
        # The clipped reward is what gets stored for learning; the raw reward
        # is what gets accumulated into the episode score below.
        r = np.clip(reward, -1, 1)

        # Store the transition in memory
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)
        # Start training only once `train_frame` frames have been collected
        if frame >= train_frame:
            agent.train_policy_net(frame)
            # Periodically update the target network
            if frame % Update_target_network_frequency == 0:
                agent.update_target_net()
        score += reward
        # Slide the window: drop the oldest frame, keep the newest four.
        history[:4, :, :] = history[1:, :, :]

        # Every 50000 frames, record the running average and refresh the plot.
        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            # Redraw from scratch so that curves do not pile up on the figure.
            pylab.clf()
            pylab.plot(episodes, rewards, 'b')
            pylab.savefig("./save_graph/breakout_dqn.png")

        if done:
            evaluation_reward.append(score)
            # Log a one-line summary at the end of every episode
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "    evaluation reward:", np.mean(evaluation_reward))

            # If the mean score over the evaluation window (at most
            # `evaluation_reward_length` episodes) exceeds 400, save the model
            # and stop training.
            if np.mean(evaluation_reward) > 400:
                # NOTE(review): this pickles `agent.model` as a whole object;
                # confirm that this attribute is the trained network.
                torch.save(agent.model, "./save_model/breakout_dqn")
                solved = True

    if solved:
        break

episode: 0   score: 2.0   memory length: 220   epsilon: 1.0    steps: 220     evaluation reward: 2.0
episode: 1   score: 1.0   memory length: 396   epsilon: 1.0    steps: 176     evaluation reward: 1.5
episode: 2   score: 0.0   memory length: 528   epsilon: 1.0    steps: 132     evaluation reward: 1.0
episode: 3   score: 2.0   memory length: 739   epsilon: 1.0    steps: 211     evaluation reward: 1.25
episode: 4   score: 0.0   memory length: 875   epsilon: 1.0    steps: 136     evaluation reward: 1.0
episode: 5   score: 2.0   memory length: 1067   epsilon: 1.0    steps: 192     evaluation reward: 1.1666666666666667
episode: 6   score: 1.0   memory length: 1224   epsilon: 1.0    steps: 157     evaluation reward: 1.1428571428571428
episode: 7   score: 2.0   memory length: 1409   epsilon: 1.0    steps: 185     evaluation reward: 1.25
episode: 8   score: 0.0   memory length: 1534   epsilon: 1.0    steps: 125     evaluation reward: 1.1111111111111112
episode: 9   score: 0.0   memory length:

episode: 73   score: 0.0   memory length: 13703   epsilon: 1.0    steps: 133     evaluation reward: 1.4054054054054055
episode: 74   score: 1.0   memory length: 13877   epsilon: 1.0    steps: 174     evaluation reward: 1.4
episode: 75   score: 0.0   memory length: 14008   epsilon: 1.0    steps: 131     evaluation reward: 1.381578947368421
episode: 76   score: 0.0   memory length: 14133   epsilon: 1.0    steps: 125     evaluation reward: 1.3636363636363635
episode: 77   score: 0.0   memory length: 14262   epsilon: 1.0    steps: 129     evaluation reward: 1.3461538461538463
episode: 78   score: 0.0   memory length: 14392   epsilon: 1.0    steps: 130     evaluation reward: 1.3291139240506329
episode: 79   score: 2.0   memory length: 14596   epsilon: 1.0    steps: 204     evaluation reward: 1.3375
episode: 80   score: 0.0   memory length: 14732   epsilon: 1.0    steps: 136     evaluation reward: 1.3209876543209877
episode: 81   score: 1.0   memory length: 14895   epsilon: 1.0    steps: 163

episode: 148   score: 0.0   memory length: 27472   epsilon: 1.0    steps: 128     evaluation reward: 1.35
episode: 149   score: 0.0   memory length: 27612   epsilon: 1.0    steps: 140     evaluation reward: 1.33
episode: 150   score: 3.0   memory length: 27850   epsilon: 1.0    steps: 238     evaluation reward: 1.31
episode: 151   score: 0.0   memory length: 27977   epsilon: 1.0    steps: 127     evaluation reward: 1.31
episode: 152   score: 1.0   memory length: 28136   epsilon: 1.0    steps: 159     evaluation reward: 1.3
episode: 153   score: 0.0   memory length: 28266   epsilon: 1.0    steps: 130     evaluation reward: 1.28
episode: 154   score: 0.0   memory length: 28395   epsilon: 1.0    steps: 129     evaluation reward: 1.28
episode: 155   score: 2.0   memory length: 28587   epsilon: 1.0    steps: 192     evaluation reward: 1.29
episode: 156   score: 2.0   memory length: 28805   epsilon: 1.0    steps: 218     evaluation reward: 1.31
episode: 157   score: 2.0   memory length: 2899

episode: 226   score: 1.0   memory length: 40453   epsilon: 1.0    steps: 158     evaluation reward: 1.04
episode: 227   score: 2.0   memory length: 40675   epsilon: 1.0    steps: 222     evaluation reward: 1.06
episode: 228   score: 2.0   memory length: 40889   epsilon: 1.0    steps: 214     evaluation reward: 1.06
episode: 229   score: 1.0   memory length: 41049   epsilon: 1.0    steps: 160     evaluation reward: 1.05
episode: 230   score: 2.0   memory length: 41253   epsilon: 1.0    steps: 204     evaluation reward: 1.06
episode: 231   score: 2.0   memory length: 41460   epsilon: 1.0    steps: 207     evaluation reward: 1.08
episode: 232   score: 1.0   memory length: 41633   epsilon: 1.0    steps: 173     evaluation reward: 1.09
episode: 233   score: 2.0   memory length: 41833   epsilon: 1.0    steps: 200     evaluation reward: 1.09
episode: 234   score: 0.0   memory length: 41969   epsilon: 1.0    steps: 136     evaluation reward: 1.08
episode: 235   score: 1.0   memory length: 421

episode: 301   score: 3.0   memory length: 53716   epsilon: 0.9926403400001598    steps: 254     evaluation reward: 1.08
episode: 302   score: 1.0   memory length: 53873   epsilon: 0.9923294800001665    steps: 157     evaluation reward: 1.08
episode: 303   score: 0.0   memory length: 54013   epsilon: 0.9920522800001725    steps: 140     evaluation reward: 1.06
episode: 304   score: 0.0   memory length: 54145   epsilon: 0.9917909200001782    steps: 132     evaluation reward: 1.06
episode: 305   score: 0.0   memory length: 54277   epsilon: 0.9915295600001839    steps: 132     evaluation reward: 1.03
episode: 306   score: 1.0   memory length: 54456   epsilon: 0.9911751400001916    steps: 179     evaluation reward: 1.03
episode: 307   score: 0.0   memory length: 54584   epsilon: 0.9909217000001971    steps: 128     evaluation reward: 1.02
episode: 308   score: 2.0   memory length: 54824   epsilon: 0.9904465000002074    steps: 240     evaluation reward: 1.04
episode: 309   score: 0.0   memo

episode: 369   score: 0.0   memory length: 66489   epsilon: 0.9673498000007088    steps: 138     evaluation reward: 1.25
episode: 370   score: 3.0   memory length: 66741   epsilon: 0.9668508400007196    steps: 252     evaluation reward: 1.26
episode: 371   score: 1.0   memory length: 66894   epsilon: 0.9665479000007262    steps: 153     evaluation reward: 1.27
episode: 372   score: 0.0   memory length: 67023   epsilon: 0.9662924800007318    steps: 129     evaluation reward: 1.27
episode: 373   score: 1.0   memory length: 67187   epsilon: 0.9659677600007388    steps: 164     evaluation reward: 1.28
episode: 374   score: 1.0   memory length: 67353   epsilon: 0.9656390800007459    steps: 166     evaluation reward: 1.27
episode: 375   score: 1.0   memory length: 67504   epsilon: 0.9653401000007524    steps: 151     evaluation reward: 1.28
episode: 376   score: 0.0   memory length: 67629   epsilon: 0.9650926000007578    steps: 125     evaluation reward: 1.28
episode: 377   score: 0.0   memo

episode: 437   score: 0.0   memory length: 78699   epsilon: 0.9431740000012336    steps: 126     evaluation reward: 1.33
episode: 438   score: 1.0   memory length: 78860   epsilon: 0.9428552200012406    steps: 161     evaluation reward: 1.33
episode: 439   score: 1.0   memory length: 79029   epsilon: 0.9425206000012478    steps: 169     evaluation reward: 1.31
episode: 440   score: 1.0   memory length: 79216   epsilon: 0.9421503400012559    steps: 187     evaluation reward: 1.27
episode: 441   score: 0.0   memory length: 79345   epsilon: 0.9418949200012614    steps: 129     evaluation reward: 1.25
episode: 442   score: 1.0   memory length: 79523   epsilon: 0.941542480001269    steps: 178     evaluation reward: 1.26
episode: 443   score: 4.0   memory length: 79803   epsilon: 0.9409880800012811    steps: 280     evaluation reward: 1.28
episode: 444   score: 1.0   memory length: 79962   epsilon: 0.9406732600012879    steps: 159     evaluation reward: 1.27
episode: 445   score: 0.0   memor

episode: 505   score: 4.0   memory length: 91302   epsilon: 0.9182200600017754    steps: 274     evaluation reward: 1.31
episode: 506   score: 5.0   memory length: 91619   epsilon: 0.917592400001789    steps: 317     evaluation reward: 1.36
episode: 507   score: 1.0   memory length: 91782   epsilon: 0.917269660001796    steps: 163     evaluation reward: 1.37
episode: 508   score: 2.0   memory length: 92002   epsilon: 0.9168340600018055    steps: 220     evaluation reward: 1.39
episode: 509   score: 1.0   memory length: 92180   epsilon: 0.9164816200018131    steps: 178     evaluation reward: 1.4
episode: 510   score: 3.0   memory length: 92431   epsilon: 0.9159846400018239    steps: 251     evaluation reward: 1.42
episode: 511   score: 3.0   memory length: 92680   epsilon: 0.9154916200018346    steps: 249     evaluation reward: 1.44
episode: 512   score: 2.0   memory length: 92864   epsilon: 0.9151273000018425    steps: 184     evaluation reward: 1.43
episode: 513   score: 1.0   memory 

episode: 573   score: 4.0   memory length: 105414   epsilon: 0.890278300002382    steps: 320     evaluation reward: 1.86
episode: 574   score: 3.0   memory length: 105673   epsilon: 0.8897654800023931    steps: 259     evaluation reward: 1.89
episode: 575   score: 1.0   memory length: 105841   epsilon: 0.8894328400024003    steps: 168     evaluation reward: 1.89
episode: 576   score: 3.0   memory length: 106062   epsilon: 0.8889952600024098    steps: 221     evaluation reward: 1.91
episode: 577   score: 1.0   memory length: 106222   epsilon: 0.8886784600024167    steps: 160     evaluation reward: 1.88
episode: 578   score: 1.0   memory length: 106402   epsilon: 0.8883220600024244    steps: 180     evaluation reward: 1.87
episode: 579   score: 0.0   memory length: 106539   epsilon: 0.8880508000024303    steps: 137     evaluation reward: 1.87
episode: 580   score: 2.0   memory length: 106728   epsilon: 0.8876765800024384    steps: 189     evaluation reward: 1.88
episode: 581   score: 0.0

episode: 641   score: 0.0   memory length: 119846   epsilon: 0.8617029400030023    steps: 129     evaluation reward: 2.12
episode: 642   score: 0.0   memory length: 119989   epsilon: 0.8614198000030084    steps: 143     evaluation reward: 2.1
episode: 643   score: 2.0   memory length: 120197   epsilon: 0.8610079600030174    steps: 208     evaluation reward: 2.1
episode: 644   score: 4.0   memory length: 120500   epsilon: 0.8604080200030304    steps: 303     evaluation reward: 2.13
episode: 645   score: 3.0   memory length: 120747   epsilon: 0.859918960003041    steps: 247     evaluation reward: 2.15
episode: 646   score: 3.0   memory length: 120980   epsilon: 0.859457620003051    steps: 233     evaluation reward: 2.17
episode: 647   score: 2.0   memory length: 121165   epsilon: 0.859091320003059    steps: 185     evaluation reward: 2.17
episode: 648   score: 1.0   memory length: 121328   epsilon: 0.858768580003066    steps: 163     evaluation reward: 2.17
episode: 649   score: 4.0   me

episode: 709   score: 0.0   memory length: 135831   epsilon: 0.8300526400036894    steps: 140     evaluation reward: 2.66
episode: 710   score: 3.0   memory length: 136061   epsilon: 0.8295972400036993    steps: 230     evaluation reward: 2.68
episode: 711   score: 2.0   memory length: 136262   epsilon: 0.8291992600037079    steps: 201     evaluation reward: 2.69
episode: 712   score: 1.0   memory length: 136434   epsilon: 0.8288587000037153    steps: 172     evaluation reward: 2.67
episode: 713   score: 3.0   memory length: 136665   epsilon: 0.8284013200037252    steps: 231     evaluation reward: 2.66
episode: 714   score: 3.0   memory length: 136897   epsilon: 0.8279419600037352    steps: 232     evaluation reward: 2.65
episode: 715   score: 2.0   memory length: 137106   epsilon: 0.8275281400037442    steps: 209     evaluation reward: 2.66
episode: 716   score: 1.0   memory length: 137273   epsilon: 0.8271974800037514    steps: 167     evaluation reward: 2.66
episode: 717   score: 6.

episode: 776   score: 6.0   memory length: 152155   epsilon: 0.7977311200043911    steps: 387     evaluation reward: 2.84
episode: 777   score: 3.0   memory length: 152388   epsilon: 0.7972697800044011    steps: 233     evaluation reward: 2.86
episode: 778   score: 5.0   memory length: 152716   epsilon: 0.7966203400044152    steps: 328     evaluation reward: 2.84
episode: 779   score: 2.0   memory length: 152912   epsilon: 0.7962322600044236    steps: 196     evaluation reward: 2.85
episode: 780   score: 5.0   memory length: 153222   epsilon: 0.7956184600044369    steps: 310     evaluation reward: 2.87
episode: 781   score: 2.0   memory length: 153415   epsilon: 0.7952363200044452    steps: 193     evaluation reward: 2.84
episode: 782   score: 4.0   memory length: 153709   epsilon: 0.7946542000044579    steps: 294     evaluation reward: 2.85
episode: 783   score: 6.0   memory length: 154038   epsilon: 0.794002780004472    steps: 329     evaluation reward: 2.88
episode: 784   score: 3.0

episode: 844   score: 5.0   memory length: 170245   epsilon: 0.7619129200051686    steps: 342     evaluation reward: 3.43
episode: 845   score: 6.0   memory length: 170646   epsilon: 0.7611189400051859    steps: 401     evaluation reward: 3.44
episode: 846   score: 6.0   memory length: 170972   epsilon: 0.7604734600051999    steps: 326     evaluation reward: 3.46
episode: 847   score: 3.0   memory length: 171257   epsilon: 0.7599091600052121    steps: 285     evaluation reward: 3.47
episode: 848   score: 4.0   memory length: 171552   epsilon: 0.7593250600052248    steps: 295     evaluation reward: 3.46
episode: 849   score: 2.0   memory length: 171756   epsilon: 0.7589211400052336    steps: 204     evaluation reward: 3.41
episode: 850   score: 6.0   memory length: 172102   epsilon: 0.7582360600052485    steps: 346     evaluation reward: 3.47
episode: 851   score: 0.0   memory length: 172233   epsilon: 0.7579766800052541    steps: 131     evaluation reward: 3.43
episode: 852   score: 1.

episode: 912   score: 0.0   memory length: 183530   epsilon: 0.7356086200057397    steps: 124     evaluation reward: 2.38
episode: 913   score: 0.0   memory length: 183655   epsilon: 0.735361120005745    steps: 125     evaluation reward: 2.34
episode: 914   score: 3.0   memory length: 183873   epsilon: 0.7349294800057544    steps: 218     evaluation reward: 2.32
episode: 915   score: 0.0   memory length: 183997   epsilon: 0.7346839600057598    steps: 124     evaluation reward: 2.28
episode: 916   score: 0.0   memory length: 184126   epsilon: 0.7344285400057653    steps: 129     evaluation reward: 2.23
episode: 917   score: 3.0   memory length: 184382   epsilon: 0.7339216600057763    steps: 256     evaluation reward: 2.22
episode: 918   score: 3.0   memory length: 184631   epsilon: 0.733428640005787    steps: 249     evaluation reward: 2.21
episode: 919   score: 0.0   memory length: 184755   epsilon: 0.7331831200057923    steps: 124     evaluation reward: 2.19
episode: 920   score: 1.0 

episode: 980   score: 2.0   memory length: 195847   epsilon: 0.7112209600062691    steps: 199     evaluation reward: 1.33
episode: 981   score: 0.0   memory length: 195973   epsilon: 0.7109714800062745    steps: 126     evaluation reward: 1.32
episode: 982   score: 2.0   memory length: 196171   epsilon: 0.710579440006283    steps: 198     evaluation reward: 1.32
episode: 983   score: 2.0   memory length: 196373   epsilon: 0.7101794800062917    steps: 202     evaluation reward: 1.33
episode: 984   score: 3.0   memory length: 196584   epsilon: 0.7097617000063008    steps: 211     evaluation reward: 1.32
episode: 985   score: 1.0   memory length: 196737   epsilon: 0.7094587600063074    steps: 153     evaluation reward: 1.33
episode: 986   score: 0.0   memory length: 196862   epsilon: 0.7092112600063127    steps: 125     evaluation reward: 1.33
episode: 987   score: 1.0   memory length: 197033   epsilon: 0.7088726800063201    steps: 171     evaluation reward: 1.33
episode: 988   score: 1.0

episode: 1047   score: 4.0   memory length: 207719   epsilon: 0.6877144000067794    steps: 318     evaluation reward: 1.36
episode: 1048   score: 2.0   memory length: 207923   epsilon: 0.6873104800067882    steps: 204     evaluation reward: 1.38
episode: 1049   score: 1.0   memory length: 208079   epsilon: 0.6870016000067949    steps: 156     evaluation reward: 1.37
episode: 1050   score: 1.0   memory length: 208252   epsilon: 0.6866590600068023    steps: 173     evaluation reward: 1.37
episode: 1051   score: 0.0   memory length: 208380   epsilon: 0.6864056200068078    steps: 128     evaluation reward: 1.37
episode: 1052   score: 2.0   memory length: 208579   epsilon: 0.6860116000068164    steps: 199     evaluation reward: 1.37
episode: 1053   score: 0.0   memory length: 208708   epsilon: 0.6857561800068219    steps: 129     evaluation reward: 1.37
episode: 1054   score: 1.0   memory length: 208861   epsilon: 0.6854532400068285    steps: 153     evaluation reward: 1.35
episode: 1055   

episode: 1114   score: 3.0   memory length: 219149   epsilon: 0.6650830000072707    steps: 247     evaluation reward: 1.18
episode: 1115   score: 2.0   memory length: 219349   epsilon: 0.6646870000072793    steps: 200     evaluation reward: 1.19
episode: 1116   score: 3.0   memory length: 219579   epsilon: 0.6642316000072892    steps: 230     evaluation reward: 1.22
episode: 1117   score: 3.0   memory length: 219830   epsilon: 0.6637346200073    steps: 251     evaluation reward: 1.24
episode: 1118   score: 0.0   memory length: 219955   epsilon: 0.6634871200073054    steps: 125     evaluation reward: 1.22
episode: 1119   score: 1.0   memory length: 220107   epsilon: 0.6631861600073119    steps: 152     evaluation reward: 1.2
episode: 1120   score: 1.0   memory length: 220260   epsilon: 0.6628832200073185    steps: 153     evaluation reward: 1.21
episode: 1121   score: 1.0   memory length: 220412   epsilon: 0.662582260007325    steps: 152     evaluation reward: 1.19
episode: 1122   score

episode: 1181   score: 2.0   memory length: 231549   epsilon: 0.6405310000078037    steps: 198     evaluation reward: 1.38
episode: 1182   score: 2.0   memory length: 231746   epsilon: 0.6401409400078122    steps: 197     evaluation reward: 1.38
episode: 1183   score: 0.0   memory length: 231877   epsilon: 0.6398815600078178    steps: 131     evaluation reward: 1.37
episode: 1184   score: 1.0   memory length: 232031   epsilon: 0.6395766400078244    steps: 154     evaluation reward: 1.37
episode: 1185   score: 0.0   memory length: 232155   epsilon: 0.6393311200078298    steps: 124     evaluation reward: 1.36
episode: 1186   score: 0.0   memory length: 232280   epsilon: 0.6390836200078351    steps: 125     evaluation reward: 1.35
episode: 1187   score: 0.0   memory length: 232414   epsilon: 0.6388183000078409    steps: 134     evaluation reward: 1.35
episode: 1188   score: 2.0   memory length: 232634   epsilon: 0.6383827000078504    steps: 220     evaluation reward: 1.37
episode: 1189   

episode: 1248   score: 1.0   memory length: 242743   epsilon: 0.6183668800082849    steps: 169     evaluation reward: 1.25
episode: 1249   score: 0.0   memory length: 242870   epsilon: 0.6181154200082903    steps: 127     evaluation reward: 1.23
episode: 1250   score: 0.0   memory length: 242993   epsilon: 0.6178718800082956    steps: 123     evaluation reward: 1.21
episode: 1251   score: 2.0   memory length: 243215   epsilon: 0.6174323200083052    steps: 222     evaluation reward: 1.21
episode: 1252   score: 2.0   memory length: 243416   epsilon: 0.6170343400083138    steps: 201     evaluation reward: 1.22
episode: 1253   score: 0.0   memory length: 243540   epsilon: 0.6167888200083191    steps: 124     evaluation reward: 1.2
episode: 1254   score: 0.0   memory length: 243662   epsilon: 0.6165472600083244    steps: 122     evaluation reward: 1.2
episode: 1255   score: 0.0   memory length: 243786   epsilon: 0.6163017400083297    steps: 124     evaluation reward: 1.19
episode: 1256   sc

episode: 1315   score: 2.0   memory length: 254275   epsilon: 0.5955335200087806    steps: 197     evaluation reward: 1.25
episode: 1316   score: 0.0   memory length: 254398   epsilon: 0.5952899800087859    steps: 123     evaluation reward: 1.23
episode: 1317   score: 4.0   memory length: 254689   epsilon: 0.5947138000087984    steps: 291     evaluation reward: 1.26
episode: 1318   score: 1.0   memory length: 254840   epsilon: 0.5944148200088049    steps: 151     evaluation reward: 1.25
episode: 1319   score: 1.0   memory length: 254990   epsilon: 0.5941178200088113    steps: 150     evaluation reward: 1.24
episode: 1320   score: 0.0   memory length: 255113   epsilon: 0.5938742800088166    steps: 123     evaluation reward: 1.22
episode: 1321   score: 0.0   memory length: 255236   epsilon: 0.5936307400088219    steps: 123     evaluation reward: 1.21
episode: 1322   score: 4.0   memory length: 255485   epsilon: 0.5931377200088326    steps: 249     evaluation reward: 1.25
episode: 1323   

episode: 1382   score: 2.0   memory length: 266175   epsilon: 0.5719715200092921    steps: 222     evaluation reward: 1.36
episode: 1383   score: 4.0   memory length: 266450   epsilon: 0.5714270200093039    steps: 275     evaluation reward: 1.38
episode: 1384   score: 2.0   memory length: 266669   epsilon: 0.5709934000093133    steps: 219     evaluation reward: 1.37
episode: 1385   score: 0.0   memory length: 266793   epsilon: 0.5707478800093186    steps: 124     evaluation reward: 1.36
episode: 1386   score: 0.0   memory length: 266921   epsilon: 0.5704944400093241    steps: 128     evaluation reward: 1.35
episode: 1387   score: 0.0   memory length: 267046   epsilon: 0.5702469400093295    steps: 125     evaluation reward: 1.35
episode: 1388   score: 3.0   memory length: 267279   epsilon: 0.5697856000093395    steps: 233     evaluation reward: 1.36
episode: 1389   score: 1.0   memory length: 267451   epsilon: 0.5694450400093469    steps: 172     evaluation reward: 1.36
episode: 1390   

episode: 1449   score: 2.0   memory length: 278228   epsilon: 0.5481065800098102    steps: 198     evaluation reward: 1.39
episode: 1450   score: 0.0   memory length: 278352   epsilon: 0.5478610600098155    steps: 124     evaluation reward: 1.36
episode: 1451   score: 2.0   memory length: 278549   epsilon: 0.547471000009824    steps: 197     evaluation reward: 1.37
episode: 1452   score: 2.0   memory length: 278769   epsilon: 0.5470354000098334    steps: 220     evaluation reward: 1.37
episode: 1453   score: 0.0   memory length: 278893   epsilon: 0.5467898800098387    steps: 124     evaluation reward: 1.34
episode: 1454   score: 0.0   memory length: 279017   epsilon: 0.5465443600098441    steps: 124     evaluation reward: 1.34
episode: 1455   score: 1.0   memory length: 279186   epsilon: 0.5462097400098513    steps: 169     evaluation reward: 1.31
episode: 1456   score: 3.0   memory length: 279459   epsilon: 0.5456692000098631    steps: 273     evaluation reward: 1.33
episode: 1457   s

episode: 1516   score: 2.0   memory length: 290004   epsilon: 0.5247901000103163    steps: 203     evaluation reward: 1.3
episode: 1517   score: 2.0   memory length: 290202   epsilon: 0.5243980600103249    steps: 198     evaluation reward: 1.32
episode: 1518   score: 0.0   memory length: 290326   epsilon: 0.5241525400103302    steps: 124     evaluation reward: 1.32
episode: 1519   score: 1.0   memory length: 290495   epsilon: 0.5238179200103374    steps: 169     evaluation reward: 1.31
episode: 1520   score: 2.0   memory length: 290696   epsilon: 0.5234199400103461    steps: 201     evaluation reward: 1.32
episode: 1521   score: 1.0   memory length: 290847   epsilon: 0.5231209600103526    steps: 151     evaluation reward: 1.33
episode: 1522   score: 3.0   memory length: 291119   epsilon: 0.5225824000103643    steps: 272     evaluation reward: 1.33
episode: 1523   score: 2.0   memory length: 291316   epsilon: 0.5221923400103727    steps: 197     evaluation reward: 1.32
episode: 1524   s

episode: 1583   score: 0.0   memory length: 301695   epsilon: 0.5016419200108189    steps: 122     evaluation reward: 1.33
episode: 1584   score: 1.0   memory length: 301870   epsilon: 0.5012954200108264    steps: 175     evaluation reward: 1.32
episode: 1585   score: 2.0   memory length: 302072   epsilon: 0.5008954600108351    steps: 202     evaluation reward: 1.34
episode: 1586   score: 1.0   memory length: 302226   epsilon: 0.5005905400108417    steps: 154     evaluation reward: 1.34
episode: 1587   score: 2.0   memory length: 302444   epsilon: 0.5001589000108511    steps: 218     evaluation reward: 1.36
episode: 1588   score: 1.0   memory length: 302597   epsilon: 0.4998559600108536    steps: 153     evaluation reward: 1.37
episode: 1589   score: 4.0   memory length: 302894   epsilon: 0.49926790001084986    steps: 297     evaluation reward: 1.4
episode: 1590   score: 1.0   memory length: 303049   epsilon: 0.4989610000108479    steps: 155     evaluation reward: 1.4
episode: 1591   s

episode: 1650   score: 2.0   memory length: 313392   epsilon: 0.47848186001071835    steps: 198     evaluation reward: 1.17
episode: 1651   score: 2.0   memory length: 313589   epsilon: 0.4780918000107159    steps: 197     evaluation reward: 1.18
episode: 1652   score: 4.0   memory length: 313846   epsilon: 0.47758294001071266    steps: 257     evaluation reward: 1.19
episode: 1653   score: 0.0   memory length: 313969   epsilon: 0.4773394000107111    steps: 123     evaluation reward: 1.15
episode: 1654   score: 0.0   memory length: 314093   epsilon: 0.47709388001070957    steps: 124     evaluation reward: 1.14
episode: 1655   score: 0.0   memory length: 314215   epsilon: 0.47685232001070804    steps: 122     evaluation reward: 1.11
episode: 1656   score: 0.0   memory length: 314338   epsilon: 0.4766087800107065    steps: 123     evaluation reward: 1.1
episode: 1657   score: 2.0   memory length: 314519   epsilon: 0.47625040001070423    steps: 181     evaluation reward: 1.12
episode: 165

episode: 1717   score: 1.0   memory length: 324738   epsilon: 0.4560167800105762    steps: 151     evaluation reward: 1.25
episode: 1718   score: 1.0   memory length: 324888   epsilon: 0.45571978001057434    steps: 150     evaluation reward: 1.25
episode: 1719   score: 2.0   memory length: 325108   epsilon: 0.4552841800105716    steps: 220     evaluation reward: 1.26
episode: 1720   score: 0.0   memory length: 325232   epsilon: 0.45503866001057003    steps: 124     evaluation reward: 1.25
episode: 1721   score: 0.0   memory length: 325357   epsilon: 0.45479116001056846    steps: 125     evaluation reward: 1.24
episode: 1722   score: 3.0   memory length: 325603   epsilon: 0.4543040800105654    steps: 246     evaluation reward: 1.25
episode: 1723   score: 2.0   memory length: 325789   epsilon: 0.45393580001056305    steps: 186     evaluation reward: 1.27
episode: 1724   score: 0.0   memory length: 325911   epsilon: 0.4536942400105615    steps: 122     evaluation reward: 1.27
episode: 172

episode: 1784   score: 1.0   memory length: 335487   epsilon: 0.43473376001044156    steps: 152     evaluation reward: 1.08
episode: 1785   score: 2.0   memory length: 335685   epsilon: 0.4343417200104391    steps: 198     evaluation reward: 1.08
episode: 1786   score: 2.0   memory length: 335885   epsilon: 0.4339457200104366    steps: 200     evaluation reward: 1.1
episode: 1787   score: 3.0   memory length: 336112   epsilon: 0.43349626001043373    steps: 227     evaluation reward: 1.12
episode: 1788   score: 0.0   memory length: 336237   epsilon: 0.43324876001043217    steps: 125     evaluation reward: 1.11
episode: 1789   score: 2.0   memory length: 336435   epsilon: 0.4328567200104297    steps: 198     evaluation reward: 1.12
episode: 1790   score: 3.0   memory length: 336680   epsilon: 0.4323716200104266    steps: 245     evaluation reward: 1.12
episode: 1791   score: 1.0   memory length: 336833   epsilon: 0.4320686800104247    steps: 153     evaluation reward: 1.13
episode: 1792 

episode: 1851   score: 2.0   memory length: 346973   epsilon: 0.4119914800102977    steps: 200     evaluation reward: 1.23
episode: 1852   score: 3.0   memory length: 347222   epsilon: 0.41149846001029455    steps: 249     evaluation reward: 1.25
episode: 1853   score: 0.0   memory length: 347344   epsilon: 0.411256900010293    steps: 122     evaluation reward: 1.22
episode: 1854   score: 0.0   memory length: 347467   epsilon: 0.4110133600102915    steps: 123     evaluation reward: 1.22
episode: 1855   score: 3.0   memory length: 347718   epsilon: 0.41051638001028834    steps: 251     evaluation reward: 1.24
episode: 1856   score: 1.0   memory length: 347888   epsilon: 0.4101797800102862    steps: 170     evaluation reward: 1.25
episode: 1857   score: 2.0   memory length: 348088   epsilon: 0.4097837800102837    steps: 200     evaluation reward: 1.26
episode: 1858   score: 1.0   memory length: 348240   epsilon: 0.4094828200102818    steps: 152     evaluation reward: 1.26
episode: 1859  

episode: 1918   score: 1.0   memory length: 358035   epsilon: 0.3900887200101591    steps: 151     evaluation reward: 1.21
episode: 1919   score: 2.0   memory length: 358232   epsilon: 0.38969866001015663    steps: 197     evaluation reward: 1.21
episode: 1920   score: 1.0   memory length: 358400   epsilon: 0.3893660200101545    steps: 168     evaluation reward: 1.2
episode: 1921   score: 0.0   memory length: 358523   epsilon: 0.389122480010153    steps: 123     evaluation reward: 1.18
episode: 1922   score: 0.0   memory length: 358645   epsilon: 0.38888092001015145    steps: 122     evaluation reward: 1.17
episode: 1923   score: 2.0   memory length: 358863   epsilon: 0.3884492800101487    steps: 218     evaluation reward: 1.18
episode: 1924   score: 0.0   memory length: 358985   epsilon: 0.3882077200101472    steps: 122     evaluation reward: 1.16
episode: 1925   score: 2.0   memory length: 359167   epsilon: 0.3878473600101449    steps: 182     evaluation reward: 1.15
episode: 1926   

episode: 1985   score: 2.0   memory length: 369777   epsilon: 0.366839560010012    steps: 199     evaluation reward: 1.14
episode: 1986   score: 3.0   memory length: 370046   epsilon: 0.36630694001000863    steps: 269     evaluation reward: 1.17
episode: 1987   score: 2.0   memory length: 370228   epsilon: 0.36594658001000635    steps: 182     evaluation reward: 1.19
episode: 1988   score: 4.0   memory length: 370501   epsilon: 0.36540604001000293    steps: 273     evaluation reward: 1.21
episode: 1989   score: 2.0   memory length: 370720   epsilon: 0.3649724200100002    steps: 219     evaluation reward: 1.22
episode: 1990   score: 0.0   memory length: 370842   epsilon: 0.36473086000999866    steps: 122     evaluation reward: 1.21
episode: 1991   score: 3.0   memory length: 371090   epsilon: 0.36423982000999555    steps: 248     evaluation reward: 1.23
episode: 1992   score: 2.0   memory length: 371289   epsilon: 0.36384580000999306    steps: 199     evaluation reward: 1.24
episode: 19

episode: 2052   score: 0.0   memory length: 381196   epsilon: 0.34422994000986895    steps: 122     evaluation reward: 1.26
episode: 2053   score: 3.0   memory length: 381443   epsilon: 0.34374088000986586    steps: 247     evaluation reward: 1.27
episode: 2054   score: 2.0   memory length: 381640   epsilon: 0.3433508200098634    steps: 197     evaluation reward: 1.28
episode: 2055   score: 3.0   memory length: 381907   epsilon: 0.34282216000986004    steps: 267     evaluation reward: 1.29
episode: 2056   score: 0.0   memory length: 382031   epsilon: 0.3425766400098585    steps: 124     evaluation reward: 1.29
episode: 2057   score: 2.0   memory length: 382230   epsilon: 0.342182620009856    steps: 199     evaluation reward: 1.31
episode: 2058   score: 2.0   memory length: 382428   epsilon: 0.3417905800098535    steps: 198     evaluation reward: 1.33
episode: 2059   score: 1.0   memory length: 382579   epsilon: 0.3414916000098516    steps: 151     evaluation reward: 1.34
episode: 2060 

episode: 2119   score: 2.0   memory length: 392602   epsilon: 0.32164606000972606    steps: 198     evaluation reward: 1.19
episode: 2120   score: 2.0   memory length: 392799   epsilon: 0.3212560000097236    steps: 197     evaluation reward: 1.21
episode: 2121   score: 0.0   memory length: 392922   epsilon: 0.32101246000972206    steps: 123     evaluation reward: 1.18
episode: 2122   score: 1.0   memory length: 393073   epsilon: 0.32071348000972016    steps: 151     evaluation reward: 1.19
episode: 2123   score: 0.0   memory length: 393200   epsilon: 0.3204620200097186    steps: 127     evaluation reward: 1.19
episode: 2124   score: 0.0   memory length: 393322   epsilon: 0.32022046000971705    steps: 122     evaluation reward: 1.18
episode: 2125   score: 1.0   memory length: 393476   epsilon: 0.3199155400097151    steps: 154     evaluation reward: 1.19
episode: 2126   score: 3.0   memory length: 393722   epsilon: 0.31942846000971203    steps: 246     evaluation reward: 1.22
episode: 21

episode: 2186   score: 0.0   memory length: 404037   epsilon: 0.2990047600095828    steps: 123     evaluation reward: 1.29
episode: 2187   score: 0.0   memory length: 404159   epsilon: 0.2987632000095813    steps: 122     evaluation reward: 1.29
episode: 2188   score: 1.0   memory length: 404330   epsilon: 0.29842462000957914    steps: 171     evaluation reward: 1.3
episode: 2189   score: 2.0   memory length: 404527   epsilon: 0.2980345600095767    steps: 197     evaluation reward: 1.32
episode: 2190   score: 0.0   memory length: 404649   epsilon: 0.29779300000957515    steps: 122     evaluation reward: 1.32
episode: 2191   score: 2.0   memory length: 404847   epsilon: 0.29740096000957267    steps: 198     evaluation reward: 1.32
episode: 2192   score: 2.0   memory length: 405045   epsilon: 0.2970089200095702    steps: 198     evaluation reward: 1.33
episode: 2193   score: 1.0   memory length: 405214   epsilon: 0.29667430000956807    steps: 169     evaluation reward: 1.34
episode: 2194

episode: 2253   score: 2.0   memory length: 415276   epsilon: 0.276751540009442    steps: 199     evaluation reward: 1.29
episode: 2254   score: 0.0   memory length: 415400   epsilon: 0.27650602000944047    steps: 124     evaluation reward: 1.28
episode: 2255   score: 1.0   memory length: 415570   epsilon: 0.27616942000943834    steps: 170     evaluation reward: 1.27
episode: 2256   score: 4.0   memory length: 415846   epsilon: 0.2756229400094349    steps: 276     evaluation reward: 1.29
episode: 2257   score: 1.0   memory length: 416014   epsilon: 0.2752903000094328    steps: 168     evaluation reward: 1.3
episode: 2258   score: 0.0   memory length: 416136   epsilon: 0.27504874000943125    steps: 122     evaluation reward: 1.28
episode: 2259   score: 0.0   memory length: 416259   epsilon: 0.2748052000094297    steps: 123     evaluation reward: 1.25
episode: 2260   score: 0.0   memory length: 416384   epsilon: 0.27455770000942814    steps: 125     evaluation reward: 1.23
episode: 2261 

episode: 2320   score: 0.0   memory length: 425882   epsilon: 0.25575166000930916    steps: 122     evaluation reward: 1.03
episode: 2321   score: 2.0   memory length: 426079   epsilon: 0.2553616000093067    steps: 197     evaluation reward: 1.04
episode: 2322   score: 1.0   memory length: 426229   epsilon: 0.2550646000093048    steps: 150     evaluation reward: 1.05
episode: 2323   score: 0.0   memory length: 426351   epsilon: 0.2548230400093033    steps: 122     evaluation reward: 1.05
episode: 2324   score: 0.0   memory length: 426474   epsilon: 0.25457950000930174    steps: 123     evaluation reward: 1.04
episode: 2325   score: 2.0   memory length: 426671   epsilon: 0.2541894400092993    steps: 197     evaluation reward: 1.04
episode: 2326   score: 2.0   memory length: 426868   epsilon: 0.2537993800092968    steps: 197     evaluation reward: 1.06
episode: 2327   score: 0.0   memory length: 426990   epsilon: 0.2535578200092953    steps: 122     evaluation reward: 1.04
episode: 2328 

episode: 2387   score: 0.0   memory length: 436185   epsilon: 0.2353517200091801    steps: 123     evaluation reward: 0.9
episode: 2388   score: 1.0   memory length: 436353   epsilon: 0.23501908000917798    steps: 168     evaluation reward: 0.9
episode: 2389   score: 1.0   memory length: 436504   epsilon: 0.2347201000091761    steps: 151     evaluation reward: 0.9
episode: 2390   score: 4.0   memory length: 436801   epsilon: 0.23413204000917237    steps: 297     evaluation reward: 0.94
episode: 2391   score: 2.0   memory length: 437019   epsilon: 0.23370040000916964    steps: 218     evaluation reward: 0.95
episode: 2392   score: 2.0   memory length: 437219   epsilon: 0.23330440000916713    steps: 200     evaluation reward: 0.95
episode: 2393   score: 3.0   memory length: 437466   epsilon: 0.23281534000916404    steps: 247     evaluation reward: 0.97
episode: 2394   score: 1.0   memory length: 437618   epsilon: 0.23251438000916214    steps: 152     evaluation reward: 0.97
episode: 2395

episode: 2454   score: 2.0   memory length: 447310   epsilon: 0.21332422000904072    steps: 197     evaluation reward: 1.0
episode: 2455   score: 0.0   memory length: 447433   epsilon: 0.21308068000903918    steps: 123     evaluation reward: 0.99
episode: 2456   score: 1.0   memory length: 447583   epsilon: 0.2127836800090373    steps: 150     evaluation reward: 1.0
episode: 2457   score: 1.0   memory length: 447736   epsilon: 0.21248074000903538    steps: 153     evaluation reward: 1.01
episode: 2458   score: 0.0   memory length: 447858   epsilon: 0.21223918000903386    steps: 122     evaluation reward: 1.01
episode: 2459   score: 1.0   memory length: 448008   epsilon: 0.21194218000903198    steps: 150     evaluation reward: 1.0
episode: 2460   score: 2.0   memory length: 448205   epsilon: 0.2115521200090295    steps: 197     evaluation reward: 1.01
episode: 2461   score: 0.0   memory length: 448327   epsilon: 0.21131056000902798    steps: 122     evaluation reward: 1.01
episode: 2462

episode: 2520   score: 0.0   memory length: 458195   epsilon: 0.19177192000890436    steps: 122     evaluation reward: 1.09
episode: 2521   score: 3.0   memory length: 458420   epsilon: 0.19132642000890154    steps: 225     evaluation reward: 1.12
episode: 2522   score: 0.0   memory length: 458542   epsilon: 0.19108486000890001    steps: 122     evaluation reward: 1.11
episode: 2523   score: 0.0   memory length: 458664   epsilon: 0.1908433000088985    steps: 122     evaluation reward: 1.11
episode: 2524   score: 2.0   memory length: 458862   epsilon: 0.190451260008896    steps: 198     evaluation reward: 1.12
episode: 2525   score: 2.0   memory length: 459059   epsilon: 0.19006120000889354    steps: 197     evaluation reward: 1.12
episode: 2526   score: 3.0   memory length: 459284   epsilon: 0.18961570000889072    steps: 225     evaluation reward: 1.15
episode: 2527   score: 3.0   memory length: 459529   epsilon: 0.18913060000888765    steps: 245     evaluation reward: 1.16
episode: 25

episode: 2587   score: 0.0   memory length: 468956   epsilon: 0.17046514000876956    steps: 122     evaluation reward: 1.06
episode: 2588   score: 2.0   memory length: 469141   epsilon: 0.17009884000876724    steps: 185     evaluation reward: 1.08
episode: 2589   score: 1.0   memory length: 469310   epsilon: 0.16976422000876512    steps: 169     evaluation reward: 1.07
episode: 2590   score: 0.0   memory length: 469436   epsilon: 0.16951474000876354    steps: 126     evaluation reward: 1.05
episode: 2591   score: 0.0   memory length: 469559   epsilon: 0.169271200008762    steps: 123     evaluation reward: 1.05
episode: 2592   score: 2.0   memory length: 469740   epsilon: 0.16891282000875973    steps: 181     evaluation reward: 1.07
episode: 2593   score: 2.0   memory length: 469938   epsilon: 0.16852078000875725    steps: 198     evaluation reward: 1.07
episode: 2594   score: 2.0   memory length: 470135   epsilon: 0.16813072000875479    steps: 197     evaluation reward: 1.06
episode: 2

episode: 2654   score: 1.0   memory length: 479684   epsilon: 0.14922370000863516    steps: 151     evaluation reward: 1.03
episode: 2655   score: 0.0   memory length: 479807   epsilon: 0.14898016000863362    steps: 123     evaluation reward: 1.03
episode: 2656   score: 0.0   memory length: 479929   epsilon: 0.1487386000086321    steps: 122     evaluation reward: 1.01
episode: 2657   score: 0.0   memory length: 480051   epsilon: 0.14849704000863057    steps: 122     evaluation reward: 1.01
episode: 2658   score: 2.0   memory length: 480248   epsilon: 0.1481069800086281    steps: 197     evaluation reward: 1.0
episode: 2659   score: 0.0   memory length: 480370   epsilon: 0.14786542000862657    steps: 122     evaluation reward: 0.99
episode: 2660   score: 0.0   memory length: 480492   epsilon: 0.14762386000862504    steps: 122     evaluation reward: 0.98
episode: 2661   score: 1.0   memory length: 480643   epsilon: 0.14732488000862315    steps: 151     evaluation reward: 0.99
episode: 26

episode: 2721   score: 0.0   memory length: 490071   epsilon: 0.12865744000850504    steps: 122     evaluation reward: 0.96
episode: 2722   score: 0.0   memory length: 490193   epsilon: 0.1284158800085035    steps: 122     evaluation reward: 0.96
episode: 2723   score: 2.0   memory length: 490390   epsilon: 0.12802582000850105    steps: 197     evaluation reward: 0.96
episode: 2724   score: 0.0   memory length: 490512   epsilon: 0.12778426000849952    steps: 122     evaluation reward: 0.96
episode: 2725   score: 1.0   memory length: 490662   epsilon: 0.12748726000849764    steps: 150     evaluation reward: 0.95
episode: 2726   score: 0.0   memory length: 490784   epsilon: 0.1272457000084961    steps: 122     evaluation reward: 0.95
episode: 2727   score: 2.0   memory length: 490981   epsilon: 0.12685564000849364    steps: 197     evaluation reward: 0.97
episode: 2728   score: 0.0   memory length: 491103   epsilon: 0.1266140800084921    steps: 122     evaluation reward: 0.97
episode: 27

episode: 2787   score: 1.0   memory length: 500676   epsilon: 0.10765954000849373    steps: 150     evaluation reward: 1.15
episode: 2788   score: 2.0   memory length: 500873   epsilon: 0.107269480008494    steps: 197     evaluation reward: 1.14
episode: 2789   score: 0.0   memory length: 500996   epsilon: 0.10702594000849416    steps: 123     evaluation reward: 1.13
episode: 2790   score: 0.0   memory length: 501118   epsilon: 0.10678438000849433    steps: 122     evaluation reward: 1.13
episode: 2791   score: 0.0   memory length: 501240   epsilon: 0.1065428200084945    steps: 122     evaluation reward: 1.11
episode: 2792   score: 0.0   memory length: 501362   epsilon: 0.10630126000849466    steps: 122     evaluation reward: 1.09
episode: 2793   score: 3.0   memory length: 501608   epsilon: 0.10581418000849499    steps: 246     evaluation reward: 1.11
episode: 2794   score: 0.0   memory length: 501730   epsilon: 0.10557262000849515    steps: 122     evaluation reward: 1.1
episode: 279

episode: 2854   score: 1.0   memory length: 511248   epsilon: 0.08672698000850801    steps: 150     evaluation reward: 1.07
episode: 2855   score: 4.0   memory length: 511504   epsilon: 0.08622010000850835    steps: 256     evaluation reward: 1.1
episode: 2856   score: 0.0   memory length: 511627   epsilon: 0.08597656000850852    steps: 123     evaluation reward: 1.1
episode: 2857   score: 0.0   memory length: 511749   epsilon: 0.08573500000850869    steps: 122     evaluation reward: 1.1
episode: 2858   score: 0.0   memory length: 511871   epsilon: 0.08549344000850885    steps: 122     evaluation reward: 1.08
episode: 2859   score: 2.0   memory length: 512054   epsilon: 0.0851311000085091    steps: 183     evaluation reward: 1.09
episode: 2860   score: 0.0   memory length: 512176   epsilon: 0.08488954000850926    steps: 122     evaluation reward: 1.07
episode: 2861   score: 0.0   memory length: 512298   epsilon: 0.08464798000850943    steps: 122     evaluation reward: 1.07
episode: 286

episode: 2921   score: 0.0   memory length: 521089   epsilon: 0.0672418000085213    steps: 122     evaluation reward: 0.82
episode: 2922   score: 1.0   memory length: 521239   epsilon: 0.0669448000085215    steps: 150     evaluation reward: 0.8
episode: 2923   score: 0.0   memory length: 521361   epsilon: 0.06670324000852167    steps: 122     evaluation reward: 0.8
episode: 2924   score: 0.0   memory length: 521483   epsilon: 0.06646168000852183    steps: 122     evaluation reward: 0.8
episode: 2925   score: 0.0   memory length: 521606   epsilon: 0.066218140008522    steps: 123     evaluation reward: 0.78
episode: 2926   score: 0.0   memory length: 521728   epsilon: 0.06597658000852216    steps: 122     evaluation reward: 0.78
episode: 2927   score: 0.0   memory length: 521851   epsilon: 0.06573304000852233    steps: 123     evaluation reward: 0.78
episode: 2928   score: 2.0   memory length: 522034   epsilon: 0.06537070000852258    steps: 183     evaluation reward: 0.8
episode: 2929   

episode: 2988   score: 0.0   memory length: 530775   epsilon: 0.04806352000853438    steps: 122     evaluation reward: 0.72
episode: 2989   score: 0.0   memory length: 530897   epsilon: 0.047821960008534545    steps: 122     evaluation reward: 0.72
episode: 2990   score: 1.0   memory length: 531066   epsilon: 0.04748734000853477    steps: 169     evaluation reward: 0.71
episode: 2991   score: 0.0   memory length: 531188   epsilon: 0.04724578000853494    steps: 122     evaluation reward: 0.7
episode: 2992   score: 0.0   memory length: 531311   epsilon: 0.047002240008535104    steps: 123     evaluation reward: 0.7
episode: 2993   score: 0.0   memory length: 531433   epsilon: 0.04676068000853527    steps: 122     evaluation reward: 0.68
episode: 2994   score: 0.0   memory length: 531555   epsilon: 0.04651912000853543    steps: 122     evaluation reward: 0.68
episode: 2995   score: 1.0   memory length: 531705   epsilon: 0.046222120008535636    steps: 150     evaluation reward: 0.69
episode