In [4]:
import gym
import itertools
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers

import baselines.common.tf_util as U

from baselines import logger
from baselines import deepq
from baselines.deepq.replay_buffer import PrioritizedReplayBuffer,ReplayBuffer
from baselines.common.schedules import LinearSchedule
from baselines.deepq.models import cnn_to_mlp

In [5]:
import scipy.misc as scimisc
import gym_minecraft
from MinecraftGym import MinecraftWrapper

In [6]:
from keras import backend as K
from keras.models import Sequential,model_from_json
from keras.layers import Dense, Activation,GRU,Input,LSTM,Conv2D,Flatten
from keras.optimizers import RMSprop,Adam
from keras.callbacks import TensorBoard

Using TensorFlow backend.


In [7]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, push_notebook, show
from bokeh.driving import linear
from bokeh.layouts import row,gridplot
from IPython.display import clear_output,display
output_notebook()

In [8]:
from bokeh.client import push_session
from bokeh.driving import cosine
from bokeh.plotting import figure, curdoc

In [9]:
from hyperdash import Experiment

In [25]:
pre_env = gym.make("MinecraftCliffWalking1-v0")
pre_env.init(videoResolution=[400,400],allowContinuousMovement=["move", "turn", "strafe"],observeGrid=[20,-1,20,20,-1,20],observeDistance=[4,45,12])
env = MinecraftWrapper(pre_env,1/10,(41,41))

In [None]:
def proc2_reward(info):
    if info is None:
        return 0
    elif 'observation' not in info.keys():
        return 0
    elif info['observation'] is None:
        return 0
    elif 'distanceFromdist' in info['observation'].keys():
        return 10/(0.001 + info['observation']['distanceFromdist'])
    else:
        return 0

In [42]:
def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        
        model = Sequential()
        model.add(Conv2D(32,(8,8),input_shape=env.observation_space.shape,activation='relu'))
        model.add(Conv2D(64,(4,4),activation='relu'))
        model.add(Conv2D(64,(3,3),activation='relu'))
        model.add(Flatten())
        model.add(Dense(256,activation='relu')
        model.add(Dense(output_dim=num_actions,activation='softmax'))
        
        
        '''
        a = Conv2D(32,(8,8),input_shape=env.observation_space.shape,activation='relu')(inpt)
        b = Conv2D(64,(3,3),activation='relu')(a)
        d = Flatten()(b)
        c = Dense(output_dim=num_actions,activation='softmax')(d)
        '''
        return model(inpt)

In [None]:
def old_model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt

        out =  tf.layers.conv2d(
                inputs=out,
                filters=32,
                kernel_size=[8, 8],
                padding="same",
                activation=tf.nn.relu)
        out =  tf.layers.conv2d(
                inputs=out,
                filters=64,
                kernel_size=[4, 4],
                padding="same",
                activation=tf.nn.relu)
        out2 = tf.contrib.layers.flatten(out)
        out2 = tf.layers.dense(inputs=out2, units=64)
        out2 = tf.layers.dense(inputs=out2, units=num_actions)
    return out2

In [10]:
def lib_model(inpt,num_action,scope,reuse = False):
    with tf.variable_scope(scope,reuse = reuse):
        return cnn_to_mlp([(32,8,4),(64,4,2),(64,3,1)],[256],True,True)

In [None]:
U.reset()
exp  = Experiment("openAI DQN - Minecraft")
exp.param("notes","using MLP CNN 32,8x8,s4 - 64,4x4,s2 - 64,3x3,s1")
#env = gym.make('SpaceInvaders-v0')

with U.make_session(2) as sess:
    # Create all the functions necessary to train the model
    K.set_session(sess)
    dqn_model = cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True
    )
    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
        q_func= model,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    )
    
    # Create the replay buffer
    replay_buffer = ReplayBuffer(1000000)
    # Create the schedule for exploration starting from 1 (every action is random) down to
    # 0.02 (98% of actions are selected according to values predicted by the model).
    exploration = LinearSchedule(schedule_timesteps=1000, initial_p=1.0, final_p=0.02)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    obs = env.reset()
    for t in itertools.count():
        # Take action and update exploration to the newest value
        action = act(obs[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, info = env.step(action)
        #rew += proc2_reward(info)
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            exp.metric("reward",episode_rewards[-1])
            episode_rewards.append(0)

        is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
        if is_solved:
            # Show off the result
            #env.render()
            #print("Solved")
            pass
        else:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if t > 100:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
            # Update target network periodically.
            if t % 1000 == 0:
                update_target()

        if done and len(episode_rewards) % 10 == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", len(episode_rewards))
            logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
            logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
            logger.dump_tabular()

{ notes: using MLP CNN 32,8x8,s4 - 64,4x4,s2 - 64,3x3,s1 }
Instructions for updating:
Use `argmax` instead


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


| reward: 110.000000 |
| reward: 150.000000 |
| reward:  75.000000 |
| reward: 185.000000 |
| reward: 135.000000 |
| reward: 120.000000 |
| reward: 370.000000 |
| reward:  55.000000 |
| reward: 125.000000 |
-------------------------------------
| % time spent exploring | 43       |
| episodes               | 10       |
| mean episode reward    | 147      |
| steps                  | 5769     |
-------------------------------------
| reward: 125.000000 |
| reward: 175.000000 |
| reward: 105.000000 |
| reward:  85.000000 |
| reward: 110.000000 |
| reward: 225.000000 |


In [14]:
def preprocess(rgb_array,scale = 1/12):
    frame_shape = rgb_array.shape
    
    frame = np.array(rgb_array)
    gray_frame = np.dot(frame[...,:3],[0.299,0.587,0.114]).reshape((frame_shape[0],frame_shape[1]))
    smaller = scimisc.imresize(gray_frame,scale,mode='L').astype('float64')
    smaller /= 255.0
    smaller = np.expand_dims(smaller,2) # convert to a 3D array of shape (height,width,grayscale)
    #smaller = np.reshape(smaller, [1, *(smaller.shape)])
    return smaller.astype("uint8")

In [20]:
def train(env,exp,model,buffer_size,epsilon,learning_rate,preprocessor = None):
    U.reset()
    #exp.param("notes","using MLP CNN 32,8x8,s4 - 64,4x4,s2 - 64,3x3,s1")
    

    with U.make_session(2) as sess:
        K.set_session(sess)
        try:
            # Create all the functions necessary to train the model
            
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
                q_func= model,
                num_actions=env.action_space.n,
                optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
            )
    
            # Create the replay buffer
            replay_buffer = ReplayBuffer(buffer_size)
            # Create the schedule for exploration starting from 1 (every action is random) down to
            # 0.02 (98% of actions are selected according to values predicted by the model).
            exploration = LinearSchedule(schedule_timesteps=1000, initial_p=1.0, final_p=0.02)

            # Initialize the parameters and copy them to the target network.
            U.initialize()
            update_target()
            
            #

            episode_rewards = [0.0]
            obs = env.reset()
            if preprocessor:
                obs = preprocessor(obs)
            for t in itertools.count():
                # Take action and update exploration to the newest value
                action = act(obs[None], update_eps=exploration.value(t))[0]
                new_obs, rew, done, info = env.step(action)
                if preprocessor:
                    new_obs = preprocessor(new_obs)
                #rew += proc2_reward(info)
                # Store transition in the replay buffer.
                replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                episode_rewards[-1] += rew
                if done:
                    obs = env.reset()
                    if preprocessor:
                        obs = preprocessor(obs)
                    if exp:
                        exp.metric("reward",episode_rewards[-1])
                    episode_rewards.append(0)
                    
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 100:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()

                if done and len(episode_rewards) % 10 == 0:
                    logger.record_tabular("steps", t)
                    logger.record_tabular("episodes", len(episode_rewards))
                    logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
                    logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                    logger.dump_tabular()
        except:
            if exp:
                exp.end()
            raise

In [None]:
env = gym.make('SpaceInvaders-v0')
exp = Experiment("OpenAI - DQN - SpaceInvaders")
train(env,exp,(42,32,1),5000)

In [13]:
pre_env = gym.make("MinecraftCliffWalking1-v0")
pre_env.init(videoResolution=[400,400],allowContinuousMovement=["move", "turn", "strafe"],observeGrid=[20,-1,20,20,-1,20],observeDistance=[4,45,12])
env = MinecraftWrapper(pre_env,1/10,(41,41))

exp = Experiment("OpenAI - DQN - SpaceInvaders")
learning_rate = 0.001
epsilon = 1
buffer_size = 1000000
exp.param("learning rate", "Adam Optimizer with lr = {}".format(learning_rate))
exp.param("buffer_size",buffer_size)

train(env,exp,env.observation_space.shape,5000,buffer_size,epsilon,learning_rate)



{ learning rate: Adam Optimizer with lr = 0.001 }
{ buffer_size: 1000000 }
This run of OpenAI - DQN - SpaceInvaders ran for 0:00:01 and logs are available locally at: /home/ubuntu/.hyperdash/logs/openai-dqn-spaceinvaders/openai-dqn-spaceinvaders_2018-01-01t01-58-52-295523.log


TypeError: 'tuple' object is not callable

In [22]:
pre_env = gym.make("MinecraftCliffWalking1-v0")
pre_env.init(videoResolution=[400,400],allowContinuousMovement=["move", "turn", "strafe"],observeGrid=[20,-1,20,20,-1,20],observeDistance=[4,45,12])
env = MinecraftWrapper(pre_env,1/5,(41,41))

exp = Experiment("OpenAI - DQN - CliffWalking")
exp.param("memory capacity",1000000)
model1 = cnn_to_mlp([(32,8,4),(64,4,2),(64,3,1)],[256],True,True)
train(env,exp,model1,1000000,1,0.001)

{ memory capacity: 1000000 }
Instructions for updating:
Use `argmax` instead
| reward: -93.392687 |
| reward: -96.212646 |
| reward: -95.283120 |
| reward: -93.379098 |
| reward: -96.221016 |
| reward: -95.259455 |
| reward: -95.241061 |
| reward: -96.228949 |
| reward: -96.221624 |
-------------------------------------
| % time spent exploring | 94       |
| episodes               | 10       |
| mean episode reward    | -95.3    |
| steps                  | 53       |
-------------------------------------
| reward: -91.468579 |
| reward: -89.517490 |
| reward: -91.450331 |
| reward: -96.213430 |
| reward: -97.159924 |
| reward: -93.393487 |
| reward: -92.374048 |
| reward: -95.273766 |
| reward: -95.284587 |
| reward: -91.489064 |
-------------------------------------
| % time spent exploring | 86       |
| episodes               | 20       |
| mean episode reward    | -94.3    |
| steps                  | 133      |
-------------------------------------
| reward: -95.248027 |
| rewar

Agent missed 1 observation(s).


| reward: -97.159643 |
| reward: -96.212986 |
| reward: -97.160033 |
| reward: -82.028122 |
| reward: -95.282201 |
| reward: -95.290948 |
| reward: -92.411133 |
| reward: -82.976105 |
| reward: -93.394595 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 130      |
| mean episode reward    | -93      |
| steps                  | 1029     |
-------------------------------------
| reward: -96.194989 |
| reward: -96.212986 |
| reward: -87.698419 |
| reward: -96.189375 |
| reward: -96.202733 |
| reward: -96.229138 |
| reward: -96.221624 |
| reward: -94.297903 |
| reward: -93.393901 |
| reward: -96.212089 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 140      |
| mean episode reward    | -93.5    |
| steps                  | 1093     |
-------------------------------------
| reward: -95.242437 |
| reward: -94.335694 |


Agent missed 1 observation(s).


| reward: -75.337390 |
| reward: -95.248122 |
| reward: -77.281815 |
| reward: -96.210662 |
| reward: -93.389761 |
| reward: -95.242437 |
| reward: -96.202733 |
| reward: -82.027067 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 150      |
| mean episode reward    | -93.3    |
| steps                  | 1207     |
-------------------------------------
| reward: -70.136269 |
| reward: -91.473884 |
| reward: -94.338413 |
| reward: -27.128781 |
| reward: -97.159643 |
| reward: -63.516298 |
| reward: -93.396688 |
| reward: -96.202871 |
| reward: -82.973017 |
| reward: -96.212218 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 160      |
| mean episode reward    | -92.1    |
| steps                  | 1382     |
-------------------------------------
| reward: -92.440144 |
| reward: -90.522276 |
| reward: -89.592352 |
| reward: -96.210781 |
| reward: -97.159643 |
| reward: -95.2

Agent missed 1 observation(s).


| reward: -95.242437 |
| reward: -74.451279 |
| reward: -96.202733 |
| reward: -93.391240 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 170      |
| mean episode reward    | -92.1    |
| steps                  | 1476     |
-------------------------------------
| reward: -36.224939 |
| reward: -63.861005 |
| reward: -96.211805 |
| reward: -94.334628 |
| reward: -96.221624 |
| reward: -91.462120 |
| reward: -89.592353 |
| reward: -75.781409 |
| reward: -83.914721 |
| reward: -96.202732 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 180      |
| mean episode reward    | -90.8    |
| steps                  | 1659     |
-------------------------------------
| reward: -19.283428 |
| reward: -84.856187 |
| reward: -56.482662 |
| reward: -92.442628 |
| reward: -16.692779 |
| reward: -93.397692 |
| reward: -90.476793 |
| reward: -86.751222 |
| reward: -96.212986 |
| reward: -62.8

Agent missed 1 observation(s).


| reward: -93.395828 |
| reward: -56.454177 |
| reward: -96.213965 |
| reward: -79.647788 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 210      |
| mean episode reward    | -86.4    |
| steps                  | 2305     |
-------------------------------------
| reward: -75.359492 |
| reward: -95.265032 |
| reward: -96.212986 |
| reward: -97.159924 |
| reward: -96.214386 |
| reward: -63.065010 |
| reward: -70.495609 |
| reward: -97.159643 |
| reward: -95.287629 |
| reward: -95.284868 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 220      |
| mean episode reward    | -85.9    |
| steps                  | 2437     |
-------------------------------------
| reward: -87.681307 |
| reward: -94.331267 |


Agent missed 1 observation(s).


| reward: -77.734025 |
| reward: -97.159643 |
| reward: -92.424885 |
| reward: -74.410236 |
| reward: -94.338197 |
| reward: -50.153506 |
| reward: -59.302165 |
| reward: -71.877717 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 230      |
| mean episode reward    | -84.6    |
| steps                  | 2655     |
-------------------------------------
| reward: -97.159643 |
| reward: -97.159643 |
| reward: -97.159378 |
| reward: -94.291957 |
| reward: -91.500706 |
| reward: -89.596872 |
| reward: -94.291659 |


Agent missed 1 observation(s).


| reward: -95.286549 |


Agent missed 1 observation(s).


| reward: -89.576085 |
| reward: -95.242437 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 240      |
| mean episode reward    | -84.5    |
| steps                  | 2727     |
-------------------------------------
| reward: -95.239360 |
| reward: -75.870224 |
| reward: -96.215822 |
| reward: -90.542917 |
| reward: -95.286268 |
| reward: -94.295675 |
| reward: -81.387713 |
| reward: -96.208457 |
| reward: -97.160108 |
| reward: -84.852444 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 250      |
| mean episode reward    | -84.6    |
| steps                  | 2832     |
-------------------------------------
| reward: -73.887680 |
| reward: -95.250396 |
| reward: -91.441993 |
| reward: -61.155897 |
| reward: -78.866059 |
| reward: -74.632795 |
| reward: -93.388795 |
| reward: -82.108288 |
| reward: -95.286927 |
| reward: -93.392037 |
-------------------------------------


Agent missed 1 observation(s).


| reward: -96.194981 |
| reward: -92.404498 |
| reward: -95.288046 |
| reward: -67.774702 |
| reward: -94.346176 |
| reward: -95.277854 |
| reward: -96.211198 |
| reward: -89.604861 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 310      |
| mean episode reward    | -88.9    |
| steps                  | 3521     |
-------------------------------------
| reward: -94.302408 |
| reward: -96.202733 |
| reward: -95.284868 |
| reward: -89.603564 |
| reward: -95.243117 |
| reward: -87.773548 |
| reward: -88.656730 |
| reward: -95.282168 |
| reward: -91.464216 |
| reward: -85.774914 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 320      |
| mean episode reward    | -89.3    |
| steps                  | 3616     |
-------------------------------------
| reward: -95.243390 |
| reward: -96.202726 |
| reward: -90.508454 |
| reward: -95.247920 |
| reward: -81.589344 |
| reward: -95.2

Agent missed 1 observation(s).


| reward: -95.287629 |
| reward: -91.454481 |
| reward: -86.777836 |
| reward: -59.298317 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 340      |
| mean episode reward    | -90.2    |
| steps                  | 3803     |
-------------------------------------
| reward: -94.297602 |
| reward: -92.460745 |
| reward: -95.287629 |
| reward: -95.279423 |
| reward: -93.394355 |
| reward: -78.954650 |
| reward: -96.208738 |
| reward: -94.292803 |
| reward: -94.293818 |


Agent missed 1 observation(s).


| reward: -95.286268 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 350      |
| mean episode reward    | -90.5    |
| steps                  | 3886     |
-------------------------------------
| reward: -91.502012 |
| reward: -94.338013 |
| reward: -94.293544 |
| reward: -94.337983 |
| reward: -95.282201 |
| reward: -91.454090 |
| reward: -86.471991 |
| reward: -95.242445 |


Agent missed 1 observation(s).


| reward: -85.831343 |
| reward: -93.325762 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 360      |
| mean episode reward    | -91.3    |
| steps                  | 3978     |
-------------------------------------
| reward: -94.339824 |
| reward: -95.270300 |
| reward: -94.336644 |
| reward: -95.280189 |
| reward: -96.229078 |
| reward: -92.431507 |
| reward: -96.215267 |
| reward: -96.208415 |
| reward: -95.242436 |
| reward: -92.437402 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 370      |
| mean episode reward    | -91.9    |
| steps                  | 4043     |
-------------------------------------
| reward: -91.462514 |
| reward: -95.248731 |
| reward: -82.983799 |
| reward: -95.242508 |
| reward: -88.634367 |
| reward: -91.509299 |
| reward: -96.212218 |
| reward: -74.650885 |
| reward: -96.189375 |


Agent missed 1 observation(s).


| reward: -95.242757 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 380      |
| mean episode reward    | -92.5    |
| steps                  | 4150     |
-------------------------------------
| reward: -90.548170 |
| reward: -88.645333 |
| reward: -96.229138 |
| reward: -96.221624 |
| reward: -96.228936 |
| reward: -94.296943 |
| reward: -68.163648 |
| reward: -81.076402 |
| reward: -92.417790 |
| reward: -95.287629 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 390      |
| mean episode reward    | -92      |
| steps                  | 4264     |
-------------------------------------
| reward: -88.625270 |
| reward: -82.482893 |
| reward: -93.366183 |
| reward: -94.338986 |
| reward: -72.137070 |
| reward: -95.282149 |
| reward: -90.551184 |
| reward: -90.597715 |
| reward: -93.389073 |
| reward: -97.159924 |
-------------------------------------
| % time spent explorin

Agent missed 1 observation(s).


| reward: -92.410512 |
| reward: -86.773189 |
| reward: -95.242758 |
| reward: -92.440408 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 440      |
| mean episode reward    | -92.3    |
| steps                  | 4705     |
-------------------------------------
| reward: -93.387663 |
| reward: -94.320054 |
| reward: -81.212466 |
| reward: -91.463264 |
| reward: -97.159643 |
| reward: -76.251135 |
| reward: -95.254800 |
| reward: -96.194990 |
| reward: -95.280903 |
| reward: -96.220915 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 450      |
| mean episode reward    | -92.2    |
| steps                  | 4801     |
-------------------------------------
| reward: -96.212986 |
| reward: -94.293140 |
| reward: -92.437347 |
| reward: -91.500050 |
| reward: -93.386328 |
| reward: -96.189719 |
| reward: -97.159100 |
| reward: -95.287629 |
| reward: -95.287584 |


Agent missed 1 observation(s).


| reward: -93.386730 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 460      |
| mean episode reward    | -92.4    |
| steps                  | 4869     |
-------------------------------------
| reward: -92.407633 |
| reward: -95.242437 |
| reward: -96.194989 |
| reward: -95.248418 |
| reward: -94.335556 |
| reward: -95.282201 |
| reward: -91.496949 |
| reward: -95.239985 |
| reward: -94.337181 |
| reward: -94.322768 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 470      |
| mean episode reward    | -92.4    |
| steps                  | 4938     |
-------------------------------------
| reward: -93.351260 |
| reward: -95.242437 |


Agent missed 1 observation(s).


| reward: -96.194989 |
| reward: -96.221624 |
| reward: -95.287623 |
| reward: -95.247920 |
| reward: -93.374506 |
| reward: -96.234567 |
| reward: -97.159643 |
| reward: -97.159924 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 480      |
| mean episode reward    | -92.9    |
| steps                  | 4995     |
-------------------------------------
| reward: -92.436192 |
| reward: -88.658578 |
| reward: -90.528466 |
| reward: -96.213143 |
| reward: -94.338378 |
| reward: -94.336533 |
| reward: -81.133265 |
| reward: -84.850652 |
| reward: -88.590386 |
| reward: -89.597650 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 490      |
| mean episode reward    | -92.9    |
| steps                  | 5109     |
-------------------------------------
| reward: -94.335248 |
| reward: -91.491813 |
| reward: -95.248104 |
| reward: -90.522412 |
| reward: -93.333531 |
| reward: -89.6

Agent missed 1 observation(s).


| reward: -95.242567 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 520      |
| mean episode reward    | -93.6    |
| steps                  | 5329     |
-------------------------------------
| reward: -95.287527 |
| reward: -95.275330 |
| reward: -90.547027 |
| reward: -81.454873 |
| reward: -96.195107 |
| reward: -93.367224 |
| reward: -97.159643 |
| reward: -90.555376 |
| reward: -95.270550 |
| reward: -96.212218 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 530      |
| mean episode reward    | -93.4    |
| steps                  | 5411     |
-------------------------------------
| reward: -86.720067 |
| reward: -94.296581 |
| reward: -97.159643 |
| reward: -92.410722 |
| reward: -95.244802 |
| reward: -95.243185 |
| reward: -91.404901 |
| reward: -93.376262 |
| reward: -96.214197 |
| reward: -92.467092 |
-------------------------------------
| % time spent explorin

Agent missed 1 observation(s).


| reward: -96.229076 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 560      |
| mean episode reward    | -93.6    |
| steps                  | 5641     |
-------------------------------------
| reward: -93.399531 |
| reward: -84.497727 |
| reward: -89.602393 |
| reward: -94.335286 |
| reward: -94.289925 |
| reward: -95.242585 |


Agent missed 1 observation(s).


| reward: -95.243230 |
| reward: -97.159643 |
| reward: -97.159924 |
| reward: -96.210715 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 570      |
| mean episode reward    | -93.5    |
| steps                  | 5717     |
-------------------------------------
| reward: -93.393324 |
| reward: -82.950845 |
| reward: -95.269107 |
| reward: -94.306693 |
| reward: -87.685424 |
| reward: -91.462619 |
| reward: -94.341788 |
| reward: -91.442291 |
| reward: -95.284868 |
| reward: -95.290146 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 580      |
| mean episode reward    | -93.2    |
| steps                  | 5810     |
-------------------------------------
| reward: -89.630930 |
| reward: -96.208738 |
| reward: -96.215847 |
| reward: -97.159924 |
| reward: -96.212465 |
| reward: -94.303587 |
| reward: -94.342521 |
| reward: -56.154524 |
| reward: -92.468978 |
| reward: -95.2

Agent missed 1 observation(s).


| reward: -95.284868 |
| reward: -94.323781 |
| reward: -95.287629 |
| reward: -95.286777 |
| reward: -97.159643 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 630      |
| mean episode reward    | -93.6    |
| steps                  | 6183     |
-------------------------------------
| reward: -86.793511 |
| reward: -96.212440 |
| reward: -95.263120 |
| reward: -97.160515 |
| reward: -90.513432 |
| reward: -94.303352 |
| reward: -92.408401 |
| reward: -97.159643 |
| reward: -87.760315 |
| reward: -96.227770 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 640      |
| mean episode reward    | -93.6    |
| steps                  | 6263     |
-------------------------------------
| reward: -95.253072 |
| reward: -90.548809 |
| reward: -95.243143 |
| reward: -95.248471 |
| reward: -96.210542 |
| reward: -95.246706 |
| reward: -94.298751 |
| reward: -95.282202 |
| reward: -94.3

Agent missed 1 observation(s).


| reward: -97.159924 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 690      |
| mean episode reward    | -94      |
| steps                  | 6637     |
-------------------------------------
| reward: -95.243931 |
| reward: -91.502431 |
| reward: -85.815644 |
| reward: -93.329782 |
| reward: -91.507531 |
| reward: -96.213471 |
| reward: -94.323554 |
| reward: -94.340254 |
| reward: -96.221532 |
| reward: -96.229138 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 700      |
| mean episode reward    | -93.9    |
| steps                  | 6716     |
-------------------------------------
| reward: -96.217131 |
| reward: -96.217681 |
| reward: -91.498037 |
| reward: -95.279120 |
| reward: -96.215009 |
| reward: -96.211531 |
| reward: -97.161398 |
| reward: -93.378476 |
| reward: -93.353336 |
| reward: -93.378506 |
-------------------------------------
| % time spent explorin

Agent missed 1 observation(s).


| reward: -88.658497 |
| reward: -94.339500 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 740      |
| mean episode reward    | -93      |
| steps                  | 7084     |
-------------------------------------
| reward: -92.446566 |
| reward: -97.161375 |
| reward: -95.254431 |
| reward: -90.532239 |
| reward: -96.226361 |
| reward: -81.358144 |
| reward: -92.402352 |
| reward: -81.982249 |
| reward: -92.442672 |
| reward: -95.248799 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 750      |
| mean episode reward    | -92.7    |
| steps                  | 7183     |
-------------------------------------
| reward: -95.282201 |
| reward: -95.277905 |
| reward: -96.221621 |
| reward: -93.352678 |
| reward: -88.619786 |
| reward: -83.936485 |
| reward: -95.254399 |
| reward: -95.248360 |
| reward: -83.052864 |
| reward: -93.355131 |
-------------------------------------


Agent missed 1 observation(s).


| reward: -95.242499 |
| reward: -92.431507 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 770      |
| mean episode reward    | -92      |
| steps                  | 7373     |
-------------------------------------
| reward: -83.958462 |
| reward: -88.624872 |
| reward: -95.289124 |
| reward: -93.351792 |
| reward: -93.354119 |
| reward: -90.478090 |
| reward: -92.420547 |
| reward: -96.212046 |
| reward: -94.323780 |
| reward: -87.714204 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 780      |
| mean episode reward    | -92.2    |
| steps                  | 7472     |
-------------------------------------
| reward: -95.284868 |


Agent missed 1 observation(s).


| reward: -89.531119 |
| reward: -94.338241 |
| reward: -86.508819 |
| reward: -90.560044 |
| reward: -93.349074 |
| reward: -93.394956 |
| reward: -96.195144 |


Agent missed 1 observation(s).


| reward: -84.860122 |
| reward: -94.296660 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 790      |
| mean episode reward    | -92.1    |
| steps                  | 7568     |
-------------------------------------
| reward: -96.194992 |
| reward: -95.281378 |
| reward: -94.310303 |
| reward: -95.282201 |
| reward: -90.542279 |
| reward: -95.279858 |
| reward: -73.340215 |
| reward: -91.502823 |
| reward: -88.665187 |
| reward: -94.294670 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 800      |
| mean episode reward    | -91.9    |
| steps                  | 7667     |
-------------------------------------
| reward: -78.170593 |
| reward: -94.329274 |
| reward: -95.239338 |
| reward: -97.159643 |
| reward: -79.813331 |
| reward: -92.456865 |
| reward: -96.213891 |
| reward: -97.159643 |
| reward: -96.221488 |
| reward: -85.772744 |
-------------------------------------


Agent missed 1 observation(s).


| reward: -76.373721 |
| reward: -94.349960 |
| reward: -96.234567 |
| reward: -95.287393 |
| reward: -95.279423 |
| reward: -82.669861 |
| reward: -96.215062 |
| reward: -96.214062 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 830      |
| mean episode reward    | -91.9    |
| steps                  | 7943     |
-------------------------------------
| reward: -97.159643 |
| reward: -96.213857 |
| reward: -95.252294 |
| reward: -96.194989 |
| reward: -94.323780 |
| reward: -95.282197 |
| reward: -92.445989 |
| reward: -95.281855 |
| reward: -97.159924 |
| reward: -97.159643 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 840      |
| mean episode reward    | -92.2    |
| steps                  | 7999     |
-------------------------------------
| reward: -96.212986 |
| reward: -93.389222 |
| reward: -93.393715 |
| reward: -88.369757 |
| reward: -95.267603 |
| reward: -85.7

Agent missed 1 observation(s).


| reward: -94.347571 |
| reward: -63.616068 |
| reward: -95.275739 |
| reward: -97.159924 |
| reward: -96.215783 |
| reward: -95.290152 |
| reward: -95.280409 |
| reward: -93.361466 |
| reward: -91.533518 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 990      |
| mean episode reward    | -93.5    |
| steps                  | 9238     |
-------------------------------------
| reward: -96.211531 |
| reward: -93.378331 |
| reward: -96.212218 |
| reward: -97.159814 |
| reward: -95.287236 |
| reward: -86.765056 |
| reward: -78.646805 |
| reward: -95.279120 |
| reward: -95.284524 |
| reward: -95.287475 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1000     |
| mean episode reward    | -93.2    |
| steps                  | 9322     |
-------------------------------------
| reward: -95.281785 |
| reward: -96.216161 |
| reward: -95.287581 |
| reward: -94.295456 |
| reward: -95.2

Agent missed 1 observation(s).


| reward: -80.363696 |
| reward: -96.215530 |
| reward: -89.581738 |
| reward: -97.159643 |
| reward: -94.340815 |
| reward: -96.214073 |
| reward: -96.214858 |
| reward: -91.499472 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1160     |
| mean episode reward    | -92.6    |
| steps                  | 10708    |
-------------------------------------
| reward: -91.481795 |
| reward: -87.698477 |
| reward: -84.838623 |
| reward: -93.375581 |
| reward: -92.422486 |
| reward: -96.229086 |
| reward: -94.339165 |
| reward: -78.604880 |
| reward: -97.159643 |
| reward: -92.417762 |
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1170     |
| mean episode reward    | -92.1    |
| steps                  | 10814    |
-------------------------------------
| reward: -97.159643 |
| reward: -96.218912 |
| reward: -96.227770 |
| reward: -95.287465 |
| reward: -95.275986 |
| reward: -90.5

KeyboardInterrupt: 

# Tests

In [12]:
np.ones_like((5,5,3))

array([1, 1, 1])

In [None]:
def reward_proc(info,grid_shape = (41,41)):
    if "grid" not in info['observation'].keys():
        return 0
    # reformat grid to a vector that only show the floor with blocks
    vec = []    
    for item in info['observation']['grid'][::-1]:
        if 'lava' in item:
            vec.append(1)
        elif 'lapis' in item:
            vec.append(2)
        else:
            vec.append(0)

     # compute reward depending on distance to target
    new_state = np.array(vec)
    tmp = np.array(vec).reshape(grid_shape)
    idx2 = np.argwhere(tmp == 2)

    size = grid_shape[0]
    idx1 = (np.ceil(size/2),np.ceil(size/2))

    a = (self._dist(idx2,idx1))
    if(a > 0):
        try:
            dist_reward = 2000/(a)
        except:
            dist_reward = 0
        return dist_reward

In [None]:
def update(x,y,handle,plot):
    plot.data_source.data['x'] += [x]
    plot.data_source.data['y'] += [y]
    push_notebook(handle=handle)

In [None]:
fig_test = figure(plot_width=1000, plot_height=400,title="rewards_test",
                      x_axis_label="x",
                      y_axis_label="y")
test_plot = fig_test.line([],[],color="navy",line_width=2)
# make a grid
handle_test = show(fig_test, notebook_handle=True)

In [None]:
count = 0
for e in range(100):
    done = False
    obs = env.reset()
    #env.render()
    count = 0
    while not done:
        
        #env.render()
        a = env.action_space.sample()
        s,r,done,info = env.step(a)
        r_ = proc2_reward(info)
        print(r_)
        r += r_
        update(count,r_,handle_test,test_plot)
        count += 1
    

In [None]:
env.close()

In [None]:
env.reset()
s,r,done,info = env.step(0)

In [None]:
s,r,done,info = env.step(0)
info

In [None]:
proc2_reward(info)

In [None]:
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
                q_func= model,
                num_actions=env.action_space.n,
                optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
            )
    

In [56]:
 make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name)


In [57]:
q_func= model

In [58]:
optimizer=tf.train.AdamOptimizer(learning_rate=0.001)

In [59]:
obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))

In [61]:
q_t = q_func(obs_t_input.get(), 6, scope="q_func", reuse=True)  # reuse parameters from act

In [62]:
q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

In [65]:
tf.trainable_variables()

[<tf.Variable 'deepq/eps:0' shape=() dtype=float32_ref>,
 <tf.Variable 'deepq/q_func/conv2d_1/kernel:0' shape=(8, 8, 1, 32) dtype=float32_ref>,
 <tf.Variable 'deepq/q_func/conv2d_1/bias:0' shape=(32,) dtype=float32_ref>,
 <tf.Variable 'deepq/q_func/conv2d_2/kernel:0' shape=(3, 3, 32, 64) dtype=float32_ref>,
 <tf.Variable 'deepq/q_func/conv2d_2/bias:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'deepq/q_func/dense_1/kernel:0' shape=(61504, 6) dtype=float32_ref>,
 <tf.Variable 'deepq/q_func/dense_1/bias:0' shape=(6,) dtype=float32_ref>,
 <tf.Variable 'deepq_1/q_func/conv2d_3/kernel:0' shape=(8, 8, 1, 32) dtype=float32_ref>,
 <tf.Variable 'deepq_1/q_func/conv2d_3/bias:0' shape=(32,) dtype=float32_ref>,
 <tf.Variable 'deepq_1/q_func/conv2d_4/kernel:0' shape=(3, 3, 32, 64) dtype=float32_ref>,
 <tf.Variable 'deepq_1/q_func/conv2d_4/bias:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'deepq_1/q_func/dense_2/kernel:0' shape=(61504, 6) dtype=float32_ref>,
 <tf.Variable 'deepq_1/q_func/dens