In [1]:
#!/usr/bin/env python
from __future__ import print_function


import os
##disable graphics for cloud computing
os.putenv('SDL_VIDEODRIVER', 'fbcon')
os.environ["SDL_VIDEODRIVER"] = "dummy"
import argparse
import skimage as skimage
from skimage import transform, color, exposure
from skimage.transform import rotate
from skimage.viewer import ImageViewer
import sys
sys.path.append("game/")
import wrapped_flappy_bird as game
import random
import numpy as np
from collections import deque

  warn("Recommended matplotlib backend is `Agg` for full "


In [2]:

import json
from keras import initializers
from keras.initializers import normal, identity
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD , Adam
import tensorflow as tf

Using TensorFlow backend.


In [3]:
GAME = 'bird' # the name of the game being played for log files
CONFIG = 'nothreshold'
ACTIONS = 2 # number of valid actions
GAMMA = 0.99 # decay rate of past observations
OBSERVATION = 3200. # timesteps to observe before training
EXPLORE = 1000#3000000. # frames over which to anneal epsilon
FINAL_EPSILON = 0.0001 # final value of epsilon
INITIAL_EPSILON = 0.1 # starting value of epsilon
REPLAY_MEMORY = 100#50000 # number of previous transitions to remember
BATCH = 32 # size of minibatch
FRAME_PER_ACTION = 1
LEARNING_RATE = 1e-4

In [4]:
img_rows , img_cols = 80, 80
#Convert image into Black and white
img_channels = 4 #We stack 4 frames

def buildmodel():
    print("Now we build the model")
    model = Sequential()
    model.add(Convolution2D(32, 8, 8, subsample=(4, 4), border_mode='same',input_shape=(img_rows,img_cols,img_channels)))  #80*80*4
    model.add(Activation('relu'))
    model.add(Convolution2D(64, 4, 4, subsample=(2, 2), border_mode='same'))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, 3, 3, subsample=(1, 1), border_mode='same'))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(2))
   
    adam = Adam(lr=LEARNING_RATE)
    model.compile(loss='mse',optimizer=adam)
    print("We finish building the model")
    return model

In [27]:
def trainNetwork(model,args='train'):
    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)

    x_t = skimage.color.rgb2gray(x_t)
    x_t = skimage.transform.resize(x_t,(80,80))
    x_t = skimage.exposure.rescale_intensity(x_t,out_range=(0,255))

    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    #print (s_t.shape)

    #In Keras, need to reshape
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  #1*80*80*4

    

    if args == 'Run':
        print('running with saved model')
        OBSERVE = 999999999    #We keep observe, never train
        epsilon = FINAL_EPSILON
        print ("Now we load weight")
        model.load_weights("model.h5")
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse',optimizer=adam)
        print ("Weight load successfully")    
    else:                       #We go to training mode
        OBSERVE = OBSERVATION
        epsilon = INITIAL_EPSILON

    t = 0
    while (True):
        loss = 0
        Q_sa = 0
        action_index = 0
        r_t = 0
        a_t = np.zeros([ACTIONS])
        #choose an action epsilon greedy
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                q = model.predict(s_t)       #input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index = max_Q
                a_t[max_Q] = 1

        #We reduced the epsilon gradually
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        #run the selected action and observed next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)

        x_t1 = skimage.color.rgb2gray(x_t1_colored)
        x_t1 = skimage.transform.resize(x_t1,(80,80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))

        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1) #1x80x80x1
        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

        # store the transition in D
        D.append((s_t, action_index, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        #only train if done observing
        if t > OBSERVE:
            #sample a minibatch to train on
            minibatch = random.sample(D, BATCH)



            inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))   #32, 80, 80, 4
            print (inputs.shape)
            targets = np.zeros((inputs.shape[0], ACTIONS))                         #32, 2

            #Now we do the experience replay
            for i in range(0, len(minibatch)):
                state_t = minibatch[i][0]
                action_t = minibatch[i][1]   #This is action index
                reward_t = minibatch[i][2]
                state_t1 = minibatch[i][3]
                terminal = minibatch[i][4]
                # if terminated, only equals reward

                inputs[i:i + 1] = state_t    #I saved down s_t

                targets[i] = model.predict(state_t)  # Hitting each buttom probability
                Q_sa = model.predict(state_t1)

                if terminal:
                    targets[i, action_t] = reward_t
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

            # targets2 = normalize(targets)
            loss += model.train_on_batch(inputs, targets)

        s_t = s_t1
        t = t + 1

        # save progress every 10000 iterations
        if t % 1000 == 0:
            print("Now we save model")
            model.save_weights("model.h5", overwrite=True)
            with open("model.json", "w") as outfile:
                json.dump(model.to_json(), outfile)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, \
            "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
            "/ Q_MAX " , np.max(Q_sa), "/ Loss ", loss)

    print("Episode finished!")
    print("************************")

In [25]:
def playGame(args='train'):
    model = buildmodel()
    trainNetwork(model,args)

def main():
    parser = argparse.ArgumentParser(description='Description of your program')
    parser.add_argument('-m','--mode', help='Train / Run', required=True)
    args = vars(parser.parse_args())
    playGame(args)

In [24]:
playGame()

Now we build the model
We finish building the model


  
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':


TIMESTEP 1 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 3 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 4 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 5 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 6 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 7 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 8 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 9 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 10 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 11 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_M

TIMESTEP 94 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 95 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 96 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 97 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 98 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 99 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 100 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 101 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 102 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 103 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 104 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMES

TIMESTEP 186 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 187 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 188 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 189 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 190 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 191 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 192 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 193 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 194 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 195 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 196 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0

TIMESTEP 279 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 280 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 281 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 282 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 283 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 284 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 285 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 286 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 287 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 288 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 289 / STATE observe / EPSILON 0.1 / ACTION 

TIMESTEP 372 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 373 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 374 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 375 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 376 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 377 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 378 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 379 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 380 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 381 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 382 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  

TIMESTEP 465 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 466 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 467 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 468 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 469 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 470 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 471 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 472 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 473 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 474 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 475 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  

TIMESTEP 558 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 559 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 560 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 561 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 562 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 563 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 564 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 565 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 566 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 567 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 568 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  

TIMESTEP 651 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 652 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 653 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 654 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 655 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 656 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 657 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 658 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 659 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 660 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 661 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  

TIMESTEP 745 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 746 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 747 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 748 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 749 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 750 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 751 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 752 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 753 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 754 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 755 / STA

TIMESTEP 837 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 838 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 839 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 840 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 841 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 842 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 843 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 844 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 845 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 846 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 847 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0

TIMESTEP 934 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 935 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 936 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 937 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 938 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 939 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 940 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 941 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 942 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 943 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 944 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 945 / STATE observe / E

TIMESTEP 1024 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1025 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1026 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1027 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1028 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1029 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1030 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1031 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1032 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1033 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1034 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1035 / STATE

TIMESTEP 1117 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1118 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1119 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1120 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1121 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1122 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1123 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1124 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1125 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1126 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1127 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1128 / STATE

TIMESTEP 1209 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1210 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1211 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1212 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1213 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1214 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1215 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1216 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 1217 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1218 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1219 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1220 / STATE 

TIMESTEP 1300 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1301 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1302 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1303 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1304 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 1305 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1306 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1307 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1308 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1309 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1310 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  

TIMESTEP 1393 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1394 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1395 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1396 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 1397 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1398 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1399 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1400 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1401 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1402 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1403 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX 

TIMESTEP 1485 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1486 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1487 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1488 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1489 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1490 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1491 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 1492 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1493 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1494 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1495 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1496 / STATE 

TIMESTEP 1579 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1580 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1581 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1582 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1583 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1584 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1585 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1586 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1587 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1588 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1589 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1590 / STATE

TIMESTEP 1674 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1675 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 1676 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 1677 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 1678 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1679 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 1680 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1681 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1682 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1683 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 

TIMESTEP 1765 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1766 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1767 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 1768 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1769 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1770 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1771 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1772 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1773 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1774 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1775 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX 

TIMESTEP 1858 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1859 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1860 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1861 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1862 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1863 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1864 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1865 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1866 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1867 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1868 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1869 / STATE

TIMESTEP 1949 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1950 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1951 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1952 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1953 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1954 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1955 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1956 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1957 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1958 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1959 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1960 / STATE

TIMESTEP 2040 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2041 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2042 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2043 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2044 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2045 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2046 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2047 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2048 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2049 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2050 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2051 / STATE

TIMESTEP 2125 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2126 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2127 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2128 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2129 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2130 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2131 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2132 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2133 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2134 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2135 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2136 / STATE

TIMESTEP 2218 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 2219 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2220 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2221 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 2222 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2223 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2224 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2225 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2226 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 2227 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2228 / STATE observe / EPSILON 0.1

TIMESTEP 2311 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2312 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2313 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2314 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2315 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2316 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2317 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2318 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2319 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2320 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2321 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2322 / STATE

TIMESTEP 2399 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2400 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2401 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2402 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2403 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2404 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2405 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2406 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2407 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2408 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2409 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Acti

TIMESTEP 2491 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2492 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2493 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2494 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2495 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2496 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2497 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2498 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2499 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2500 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2501 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2502 / STATE

TIMESTEP 2584 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2585 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2586 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2587 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2588 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 2589 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2590 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2591 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2592 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2593 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 2594 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  

TIMESTEP 2679 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2680 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2681 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2682 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2683 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2684 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2685 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2686 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2687 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2688 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2689 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2690 / STATE

TIMESTEP 2769 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2770 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2771 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2772 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2773 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2774 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2775 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2776 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2777 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2778 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2779 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2780 / STATE

TIMESTEP 2862 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2863 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2864 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2865 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2866 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2867 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2868 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2869 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2870 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 2871 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2872 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX 

TIMESTEP 2948 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2949 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 2950 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2951 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2952 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2953 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2954 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2955 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 2956 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2957 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2958 / STATE observe / EPSILON 0.

TIMESTEP 3039 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3040 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 3041 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3042 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3043 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3044 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3045 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3046 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3047 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3048 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3049 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX 

TIMESTEP 3132 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3133 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3134 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3135 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3136 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3137 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3138 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
TIMESTEP 3139 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3140 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3141 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3142 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX 

TIMESTEP 3218 / STATE explore / EPSILON 0.0983017 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3219 / STATE explore / EPSILON 0.0982018 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3220 / STATE explore / EPSILON 0.0981019 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3221 / STATE explore / EPSILON 0.098002 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3222 / STATE explore / EPSILON 0.0979021 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
----------Random Action----------
(32, 80, 80, 4)
TIMESTEP 3223 / STATE explore / EPSILON 0.0978022 / ACTION 1 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3224 / STATE explore / EPSILON 0.0977023 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3225 / STATE explore / EPSILON 0.0976024 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3226 / STATE explore / EPSILON 0.09750

TIMESTEP 3288 / STATE explore / EPSILON 0.0913087 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
----------Random Action----------
(32, 80, 80, 4)
TIMESTEP 3289 / STATE explore / EPSILON 0.0912088 / ACTION 1 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3290 / STATE explore / EPSILON 0.0911089 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3291 / STATE explore / EPSILON 0.091009 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3292 / STATE explore / EPSILON 0.0909091 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3293 / STATE explore / EPSILON 0.0908092 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3294 / STATE explore / EPSILON 0.0907093 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3295 / STATE explore / EPSILON 0.0906094 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3296 / STATE explore / EPSILON 0.09050

TIMESTEP 3357 / STATE explore / EPSILON 0.0844156 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3358 / STATE explore / EPSILON 0.0843157 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3359 / STATE explore / EPSILON 0.0842158 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3360 / STATE explore / EPSILON 0.0841159 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
----------Random Action----------
(32, 80, 80, 4)
TIMESTEP 3361 / STATE explore / EPSILON 0.084016 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3362 / STATE explore / EPSILON 0.0839161 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
----------Random Action----------
(32, 80, 80, 4)
TIMESTEP 3363 / STATE explore / EPSILON 0.0838162 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3364 / STATE explore / EPSILON 0.0837163 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3365

TIMESTEP 3426 / STATE explore / EPSILON 0.0775225 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3427 / STATE explore / EPSILON 0.0774226 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3428 / STATE explore / EPSILON 0.0773227 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3429 / STATE explore / EPSILON 0.0772228 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3430 / STATE explore / EPSILON 0.0771229 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3431 / STATE explore / EPSILON 0.077023 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3432 / STATE explore / EPSILON 0.0769231 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3433 / STATE explore / EPSILON 0.0768232 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3434 / STATE explore / EPSILON 0.0767233 / ACTION 0 / REWARD 0.1 / Q_MAX

TIMESTEP 3497 / STATE explore / EPSILON 0.0704296 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3498 / STATE explore / EPSILON 0.0703297 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3499 / STATE explore / EPSILON 0.0702298 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3500 / STATE explore / EPSILON 0.0701299 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3501 / STATE explore / EPSILON 0.07003 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3502 / STATE explore / EPSILON 0.0699301 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3503 / STATE explore / EPSILON 0.0698302 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3504 / STATE explore / EPSILON 0.0697303 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
----------Random Action----------
(32, 80, 80, 4)
TIMESTEP 3505 / STATE explore / EPSILON 0.069630

TIMESTEP 3568 / STATE explore / EPSILON 0.0633367 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3569 / STATE explore / EPSILON 0.0632368 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3570 / STATE explore / EPSILON 0.0631369 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3571 / STATE explore / EPSILON 0.063037 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3572 / STATE explore / EPSILON 0.0629371 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3573 / STATE explore / EPSILON 0.0628372 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3574 / STATE explore / EPSILON 0.0627373 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3575 / STATE explore / EPSILON 0.0626374 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3576 / STATE explore / EPSILON 0.0625375 / ACTION 0 / REWARD -1 / Q_MAX 

TIMESTEP 3640 / STATE explore / EPSILON 0.0561439 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3641 / STATE explore / EPSILON 0.056044 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
----------Random Action----------
(32, 80, 80, 4)
TIMESTEP 3642 / STATE explore / EPSILON 0.0559441 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3643 / STATE explore / EPSILON 0.0558442 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3644 / STATE explore / EPSILON 0.0557443 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3645 / STATE explore / EPSILON 0.0556444 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3646 / STATE explore / EPSILON 0.0555445 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3647 / STATE explore / EPSILON 0.0554446 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
----------Random Action----------
(32, 80, 80, 4)
TIMESTEP 3648

TIMESTEP 3709 / STATE explore / EPSILON 0.0492508 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3710 / STATE explore / EPSILON 0.0491509 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3711 / STATE explore / EPSILON 0.049051 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3712 / STATE explore / EPSILON 0.0489511 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3713 / STATE explore / EPSILON 0.0488512 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3714 / STATE explore / EPSILON 0.0487513 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3715 / STATE explore / EPSILON 0.0486514 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3716 / STATE explore / EPSILON 0.0485515 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3717 / STATE explore / EPSILON 0.0484516 / ACTION 0 / REWARD 0.1 / Q_MAX

TIMESTEP 3779 / STATE explore / EPSILON 0.0422578 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3780 / STATE explore / EPSILON 0.0421579 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3781 / STATE explore / EPSILON 0.042058 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3782 / STATE explore / EPSILON 0.0419581 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3783 / STATE explore / EPSILON 0.0418582 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3784 / STATE explore / EPSILON 0.0417583 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3785 / STATE explore / EPSILON 0.0416584 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3786 / STATE explore / EPSILON 0.0415585 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3787 / STATE explore / EPSILON 0.0414586 / ACTION 0 / REWARD 0.1 / Q_MAX

TIMESTEP 3850 / STATE explore / EPSILON 0.0351649 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3851 / STATE explore / EPSILON 0.035065 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3852 / STATE explore / EPSILON 0.0349651 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3853 / STATE explore / EPSILON 0.0348652 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3854 / STATE explore / EPSILON 0.0347653 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3855 / STATE explore / EPSILON 0.0346654 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3856 / STATE explore / EPSILON 0.0345655 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3857 / STATE explore / EPSILON 0.0344656 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3858 / STATE explore / EPSILON 0.0343657 / ACTION 0 / REWARD 0.1 / Q_MAX

TIMESTEP 3921 / STATE explore / EPSILON 0.028072 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3922 / STATE explore / EPSILON 0.0279721 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3923 / STATE explore / EPSILON 0.0278722 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3924 / STATE explore / EPSILON 0.0277723 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3925 / STATE explore / EPSILON 0.0276724 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3926 / STATE explore / EPSILON 0.0275725 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3927 / STATE explore / EPSILON 0.0274726 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3928 / STATE explore / EPSILON 0.0273727 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3929 / STATE explore / EPSILON 0.0272728 / ACTION 0 / REWARD 0.1 / Q_MAX

TIMESTEP 3993 / STATE explore / EPSILON 0.0208792 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3994 / STATE explore / EPSILON 0.0207793 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3995 / STATE explore / EPSILON 0.0206794 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3996 / STATE explore / EPSILON 0.0205795 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3997 / STATE explore / EPSILON 0.0204796 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3998 / STATE explore / EPSILON 0.0203797 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 3999 / STATE explore / EPSILON 0.0202798 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
Now we save model
TIMESTEP 4000 / STATE explore / EPSILON 0.0201799 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4001 / STATE explore / EPSILON 0.02008 / ACTION 0 / R

TIMESTEP 4065 / STATE explore / EPSILON 0.0136864 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4066 / STATE explore / EPSILON 0.0135865 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4067 / STATE explore / EPSILON 0.0134866 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4068 / STATE explore / EPSILON 0.0133867 / ACTION 0 / REWARD -1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4069 / STATE explore / EPSILON 0.0132868 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4070 / STATE explore / EPSILON 0.0131869 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4071 / STATE explore / EPSILON 0.013087 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4072 / STATE explore / EPSILON 0.0129871 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4073 / STATE explore / EPSILON 0.0128872 / ACTION 0 / REWARD 0.1 / Q_MAX 

TIMESTEP 4137 / STATE explore / EPSILON 0.0064936 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4138 / STATE explore / EPSILON 0.0063937 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4139 / STATE explore / EPSILON 0.0062938 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4140 / STATE explore / EPSILON 0.0061939 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4141 / STATE explore / EPSILON 0.006094 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4142 / STATE explore / EPSILON 0.0059941 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4143 / STATE explore / EPSILON 0.0058942 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4144 / STATE explore / EPSILON 0.0057943 / ACTION 0 / REWARD -1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4145 / STATE explore / EPSILON 0.0056944 / ACTION 0 / REWARD 0.1 / Q_MAX 

TIMESTEP 4209 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4210 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4211 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4212 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4213 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4214 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4215 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4216 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4217 / STATE train / EP

TIMESTEP 4277 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD -1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4278 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4279 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4280 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4281 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4282 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4283 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4284 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4285 / STATE train / EPS

TIMESTEP 4345 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4346 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4347 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4348 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4349 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4350 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4351 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4352 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4353 / STATE train / EP

TIMESTEP 4413 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4414 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4415 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4416 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4417 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4418 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4419 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4420 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4421 / STATE train / EP

TIMESTEP 4481 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4482 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4483 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4484 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4485 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4486 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD -1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4487 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4488 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4489 / STATE train / EPS

TIMESTEP 4549 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4550 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4551 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4552 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4553 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4554 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4555 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4556 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4557 / STATE train / EP

TIMESTEP 4617 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4618 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4619 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD -1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4620 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4621 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4622 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4623 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4624 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4625 / STATE train / EPS

TIMESTEP 4685 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4686 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4687 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4688 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4689 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4690 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4691 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4692 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4693 / STATE train / EP

TIMESTEP 4753 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4754 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4755 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4756 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4757 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4758 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4759 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4760 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4761 / STATE train / EP

TIMESTEP 4821 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4822 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4823 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4824 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4825 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4826 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4827 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4828 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD -1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4829 / STATE train / EPS

TIMESTEP 4889 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4890 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4891 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4892 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4893 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4894 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4895 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4896 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4897 / STATE train / EP

TIMESTEP 4957 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4958 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4959 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4960 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4961 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD -1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4962 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4963 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4964 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 4965 / STATE train / EPS

TIMESTEP 5025 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5026 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5027 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5028 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5029 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5030 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5031 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5032 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5033 / STATE train / EP

TIMESTEP 5093 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5094 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD -1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5095 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5096 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5097 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5098 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5099 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5100 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5101 / STATE train / EPS

TIMESTEP 5161 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5162 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5163 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5164 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5165 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5166 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5167 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5168 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5169 / STATE train / EP

TIMESTEP 5229 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5230 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5231 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5232 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5233 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5234 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5235 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5236 / STATE train / EPSILON 1.00000000017e-07 / ACTION 0 / REWARD 0.1 / Q_MAX  nan / Loss  nan
(32, 80, 80, 4)
TIMESTEP 5237 / STATE train / EP

KeyboardInterrupt: 

In [28]:
playGame(args='Run')

Now we build the model
We finish building the model

  
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':



running with saved model
Now we load weight
Weight load successfully
TIMESTEP 1 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 2 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 3 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 4 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 5 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 6 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 7 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 8 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 9 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 10 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 11 / STATE observe / EPSILON 0

TIMESTEP 95 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 96 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 97 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 98 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 99 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 100 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 101 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 102 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 103 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 104 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 105 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIME

TIMESTEP 187 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 188 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 189 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 190 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 191 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 192 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 193 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 194 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 195 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 196 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 197 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0


TIMESTEP 279 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 280 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 281 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 282 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 283 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 284 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 285 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 286 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 287 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 288 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 289 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0


TIMESTEP 370 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 371 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 372 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 373 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 374 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 375 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 376 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 377 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 378 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 379 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 380 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0


TIMESTEP 462 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 463 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 464 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 465 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 466 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 467 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 468 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 469 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 470 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 471 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 472 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0

TIMESTEP 556 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 557 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 558 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 559 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 560 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 561 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 562 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 563 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 564 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 565 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 566 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0

TIMESTEP 647 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 648 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 649 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 650 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 651 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 652 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 653 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 654 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 655 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 656 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 657 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0

TIMESTEP 739 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 740 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 741 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 742 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 743 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 744 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 745 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 746 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 747 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 748 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 749 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0


TIMESTEP 830 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 831 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 832 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 833 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 834 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 835 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 836 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 837 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 838 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 839 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 840 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0


TIMESTEP 923 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 924 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 925 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 926 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 927 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 928 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 929 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 930 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 931 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 932 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 933 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0


TIMESTEP 1015 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1016 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1017 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1018 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1019 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1020 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1021 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1022 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1023 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1024 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1025 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD -1 / Q_MAX  0

TIMESTEP 1108 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1109 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1110 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1111 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1112 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1113 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1114 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1115 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1116 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1117 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1118 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  

TIMESTEP 1199 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1200 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1201 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1202 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1203 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1204 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1205 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1206 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1207 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1208 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1209 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  

TIMESTEP 1291 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 1292 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1293 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1294 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1295 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1296 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1297 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1298 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1299 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1300 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1301 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0

TIMESTEP 1384 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1385 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1386 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 1387 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1388 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1389 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1390 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1391 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1392 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1393 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1394 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0

TIMESTEP 1477 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1478 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1479 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1480 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1481 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 1482 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1483 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1484 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1485 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1486 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1487 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0

TIMESTEP 1568 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1569 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1570 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1571 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1572 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1573 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1574 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1575 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1576 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD -1 / Q_MAX  0 / Loss  0
TIMESTEP 1577 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1578 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0

TIMESTEP 1659 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1660 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1661 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1662 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1663 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1664 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1665 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1666 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1667 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1668 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1669 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  

TIMESTEP 1751 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1752 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1753 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1754 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1755 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1756 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1757 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1758 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1759 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1760 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 1761 / STATE observe / EPSILON 0.0001 / ACTION 0 / REWARD 0.1 / Q_MAX  

KeyboardInterrupt: 

In [7]:


if __name__ == "__main__":
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)
    main()


usage: ipykernel_launcher.py [-h] -m MODE
ipykernel_launcher.py: error: argument -m/--mode is required


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
