In [1]:
import MalmoPython
import os
import sys
import time
import random
import json
import itertools
import math

In [2]:
from baselines import deepq
from baselines import logger
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule

In [3]:
import tensorflow as tf
import tensorflow.contrib.layers as layers
import baselines.common.tf_util as U

In [4]:
from IPython.display import clear_output,display
import numpy as np

In [5]:
from keras import backend as Kend
from keras.layers import GRU,Dense,Activation,Input,LSTM
from keras.models import Sequential

Using TensorFlow backend.


In [6]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, push_notebook, show
from bokeh.driving import linear
from bokeh.layouts import row,gridplot
output_notebook()

In [None]:
show()

## Make Env be able to interface with code

In [147]:
class Env:
    def __init__(self,actions,obs_shape = 3):
        self.world_state = None
        self.my_mission_record = MalmoPython.MissionRecordSpec()
        self.data = None
        self.observation_space = np.zeros(shape=(obs_shape**2,))
        self.grid = obs_shape
        self.actions = actions
        
        self.host = MalmoPython.AgentHost()
        try:
            self.host.parse( sys.argv )
        except RuntimeError as e:
            print ('ERROR:',e)
            print (self.host.getUsage())
            if self.host.receivedArgument("help"):
                print (self.host.getUsage())
                exit(0)
    def _dist(self,x,y):
        return np.sqrt(np.sum((x-y)**2))
    def _locate_closest(self,block_id,grid,pos):
        block_loc = np.argwhere(grid==block_id)
        
        if (block_loc.size == 0 and self.world_state.is_mission_running):
            return np.inf
        else:
            block_dist = []
            for idx in block_loc:
                block_dist.append(self._dist(idx,pos))
            return np.min(np.array(block_dist))
        
    def observe(self):
        
        self.world_state = self.host.peekWorldState()
        while self.world_state.is_mission_running and all(e.text=='{}' for e in self.world_state.observations):
            self.world_state = self.host.peekWorldState()

        # If there are some new observations
        if self.world_state.number_of_observations_since_last_state > 0:
            self.data = json.loads(self.world_state.observations[-1].text)
        
        state = self.world_state
            
        #data = json.loads(state.observations[-1].text)
        # get reward if detected, else reward is -1
        reward = 0
        if state.number_of_rewards_since_last_state > 0:
            reward = state.rewards[0].getValue()
    
        # reformat grid to a vector that only show the floor with blocks
        vec = []    
        for item in self.data['grid'][::-1]:
            if 'lava' in item:
                vec.append(1)
            elif 'lapis' in item:
                vec.append(2)
            else:
                vec.append(0)
        
        
        #calculate distance to target and assign reward
        new_state = np.array(vec)
        
        if self.world_state.is_mission_running is False and new_state.sum() != 0.1:
            return 0,0,0,self.world_state
        
        tmp = np.array(vec).reshape(self.grid,self.grid)
        center = int(np.floor(self.grid/2))
        cntr_idx = (center,center)
                     
        target_dist = self._locate_closest(2,tmp,cntr_idx)
        
        fire_dist = self._locate_closest(1,tmp,cntr_idx)
        
        lnd_dist = self._locate_closest(0,tmp,cntr_idx)
        
        tmp[cntr_idx] = 5
            
        proto_state = np.array([target_dist,fire_dist,lnd_dist])
        if fire_dist == np.inf:
            print(tmp)
        reward = (1/(1+target_dist))*800 + fire_dist*100 + (1/(1+lnd_dist))
        return(reward,proto_state,tmp,state) # return r,s,data,extra_info
        
    def startworld(self,world_file):
        with open(world_file,'r') as f:
            my_mission = MalmoPython.MissionSpec(f.read(), True)
        my_mission_record = MalmoPython.MissionRecordSpec()
        # Attempt to start a mission:
        max_retries = 3
        for retry in range(max_retries):
            try:
                self.host.startMission( my_mission, my_mission_record )
                sys.stdout.write("Mission Started")
                break
            except RuntimeError as e:
                if retry == max_retries - 1:
                    print ("Error starting mission:{}".format(e))
                    exit(1)
                else:
                    time.sleep(2)
        # Loop until mission starts:
        #print ("Waiting for the mission to start ")
        self.world_state = self.host.getWorldState()
        while (not self.world_state.has_mission_begun):
            sys.stdout.write(".")
            time.sleep(0.1)
            self.world_state = self.host.getWorldState()
            for error in self.world_state.errors:
                print ("Error:",error.text)
                
        ## wait until a valid observation        
        while self.world_state.is_mission_running and all(e.text=='{}' for e in self.world_state.observations):
            self.world_state = self.host.peekWorldState()
        #populate emtpy fields for init
        self.data = json.loads(self.world_state.observations[-1].text)
        
        return self.observe()
    def quit(self):
        self.host.sendCommand('quit')
        
    def step(self,action):
        self.host.sendCommand(self.actions[action])
        return self.observe()

## Test ground

In [10]:
agent_host = MalmoPython.AgentHost()
try:
    agent_host.parse( sys.argv )
except RuntimeError as e:
    print ('ERROR:',e)
    print (agent_host.getUsage())
    exit(1)
if agent_host.receivedArgument("help"):
    print (agent_host.getUsage())
    exit(0)

ERROR: unrecognised option '-f'
Malmo version: 0.31.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test




In [148]:
tempenv = Env(None,21)
r,s,data,ws = tempenv.startworld("CliffWalking.xml")

for t in itertools.count():
    #clear_output(wait=True)
    display(s,r)

    time.sleep(1)
    
    if ws.is_mission_running is False:
        r,s,data,ws = tempenv.observe()
        #clear_output(wait=True)
        display(s,r,data)
        print('done')
        break
    r,s,data,ws = tempenv.observe()


ERROR: unrecognised option '-f'
Malmo version: 0.31.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test


Mission Started...............

array([ inf,   1.,   0.])

101.0

array([ inf,   1.,   0.])

101.0

array([ inf,   1.,   0.])

101.0

array([ inf,   1.,   0.])

101.0

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


array([ inf,  inf,   0.])

inf

0

0

0

0

0

done


In [105]:
tempenv.quit()

In [None]:
    world_state = agent_host.getWorldState()
    # print(chose_act)
    clear_output(wait=True)
    a = json.loads(world_state.observations[-1].text)
    
    vec = []
    for item in a['grid'][::-1]:
        if 'lava' in item:
            vec.append(1)
        else:
            vec.append(0)
    narray = np.array(vec)
    narray

In [None]:
dir(tf.layers.dense)

In [None]:
a = Sequential()
dir(a)

## Start Code

In [112]:
simple_actions = {
    'strafe':{
        'left': 'strafe -1',
        'right': 'strafe 1'
    },
    'move':{
        'back':'move -1',
        'forward':'move 1'
    }   
}
# flatten dict of actions
ractions = []
for action_type in simple_actions.keys():
    
    for action in simple_actions[action_type]:
        ractions.append(simple_actions[action_type][action])

In [113]:
def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out, num_outputs=65, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=130, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=tf.nn.softmax)
        return out

In [114]:
p = figure(plot_width=400, plot_height=400,title="rewards",
                      x_axis_label="x",
                      y_axis_label="y",sizing_mode='scale_width')
rewards_plot = p.line([],[],color="firebrick",line_width=2)
# make a grid
handle = show(p, notebook_handle=True)
figure

<function bokeh.plotting.figure.figure>

In [115]:
def update(x,y):
    rewards_plot.data_source.data['x'] += [x]
    rewards_plot.data_source.data['y'] += [y]
    push_notebook(handle=handle)

In [120]:
def Train(env,world):
    U.reset()
    with U.make_session(2) as sess:
        replay_buffer = ReplayBuffer(500000)
        #exploration = LinearSchedule(schedule_timesteps=10000, initial_p=0.5, final_p=0.1)
        episode_rewards = [0.0]
        
        Kend.set_session(sess)
        
        def new_model(inpt, num_actions, scope, reuse=False):
            with tf.variable_scope(scope, reuse=reuse):
                out = Dense(64,activation='relu')(inpt)
                out = Dense(num_actions,activation='softmax')(out)
                return out
        
        r,s,data,ws = env.startworld(world)
        act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: U.BatchInput((3,), name=name), # prev 49
        q_func=model,
        num_actions= len(env.actions), #prev len(ractions)
        optimizer=tf.train.AdamOptimizer(learning_rate=0.01)
        )
        U.initialize()
        update_target()
        
        R = 0
    
        for t in itertools.count():
            # exploration schedule update_eps=exploration.value(t)
            action = act(s[None])[0]

            r,s_,data,ws = env.step(action)

            done = ws.is_mission_running is False
            replay_buffer.add(s,action,r,s_,done)


            s = s_

            episode_rewards[-1] += r
            
            R += r
            #writer = tf.summary.FileWriter("logs", sess.graph)
            if done: #mission is done
                _,s,_,ws = env.startworld('CliffWalking.xml')
                print(t,episode_rewards[-1])
                update(t,episode_rewards[-1])
                episode_rewards.append(0)
                #s = gym_env.reset()


            is_solved = t > 100 and np.mean(episode_rewards) >= 10000

            if is_solved:
                # Show off the result
                whaa = 5+2
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()
                    clear_output(wait=True)             
                    display(r)

                if t % 100 == 0 and t > 5:
                    whaa = 1

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()
        #writer.close()
                

In [121]:
env = Env(ractions,21)
Train(env,'CliffWalking.xml')

101.0

Mission Started..............74 inf




ValueError: Out of range float values are not JSON compliant

In [None]:
env.quit()

Testing with gym env

In [None]:
import gym
gym_env = gym.make("CartPole-v0")

In [None]:
dir(env_gym.observation_space.sample())

In [None]:
obs = env_gym.reset()
obs[None]

In [None]:
env_gym.action_space.n

In [None]:
obs,_,_,_ = env_gym.step(1)
obs.shape

## Testing Env class

In [None]:
env1 = Env(None)
env1.startworld('CliffWalking.xml');
r,s,data,ws = env.observe();
s[None]

In [None]:
env1.quit()

In [None]:
class a:
    def __init__(self):
        self.prop1 = 1
        self.prop2 = 2

In [None]:
show()

In [None]:
a.prop1

In [None]:
deepq.build_train??

In [None]:

def update(count):
    tmp1  = rewards.data_source.data['x']
    tmp2 = rewards.data_source.data['y']
    rewards.data_source.data['x'] += [count]
    rewards.data_source.data['y'] += [random.randint(1,10000)]
    push_notebook(handle=handle)

In [None]:
for i in range(50):
    update(i)

In [None]:
opts = dict(plot_width=250, plot_height=250, min_border=0)
p1 = figure(**opts)
r1 = p1.circle([1,2,3], [4,5,6], size=20)

p2 = figure(**opts)
r2 = p2.circle([1,2,3], [4,5,6], size=20)

# get a handle to update the shown cell with
t = show(row(p1, p2), notebook_handle=True)

In [None]:
r1.glyph.fill_color = "white"
push_notebook(handle=t)

In [None]:
r1.glyph.fill_color = "orange"
push_notebook(handle=t)

In [None]:
dir(p)