In [68]:
import MalmoPython
import os
import sys
import time
import random
import json
import itertools
import math

In [37]:
from baselines import deepq
from baselines import logger
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule

In [38]:
import tensorflow as tf
import tensorflow.contrib.layers as layers
import baselines.common.tf_util as U

In [39]:
from IPython.display import clear_output,display
import numpy as np

In [40]:
from keras import backend as Kend
from keras.layers import GRU,Dense,Activation,Input,LSTM
from keras.models import Sequential

In [41]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, push_notebook, show
from bokeh.driving import linear
from bokeh.layouts import row,gridplot
output_notebook()

## Make Env be able to interface with code

In [59]:
class Env:
    def __init__(self,actions,obs_shape = 3):
        self.world_state = None
        self.my_mission_record = MalmoPython.MissionRecordSpec()
        self.data = None
        self.observation_space = np.zeros(shape=(obs_shape**2,))
        self.grid = obs_shape
        self.actions = actions
        
        self.host = MalmoPython.AgentHost()
        try:
            self.host.parse( sys.argv )
        except RuntimeError as e:
            print ('ERROR:',e)
            print (self.host.getUsage())
            if self.host.receivedArgument("help"):
                print (self.host.getUsage())
                exit(0)
    def _dist(self,x,y):
        return np.sqrt(np.sum((x-y)**2))
    def _locate_closest(self,block_id,grid,pos):
        block_loc = np.argwhere(grid==block_id)
        
        if (block_loc.size == 0):
            return None
        else:
            block_dist = []
            for idx in block_loc:
                block_dist.append(self._dist(idx,pos))
            return np.min(np.array(block_dist))
        
    def observe(self):
        num_frames_seen = self.world_state.number_of_video_frames_since_last_state
        while self.world_state.is_mission_running and self.world_state.number_of_video_frames_since_last_state == num_frames_seen:
            self.world_state = self.host.peekWorldState()
        
        while self.world_state.is_mission_running and all(e.text=='{}' for e in self.world_state.observations):
            self.world_state = self.host.peekWorldState()
        
        if self.world_state.number_of_observations_since_last_state > 0:
            self.data = json.loads(self.world_state.observations[-1].text)
        

        state = self.world_state
            
        #data = json.loads(state.observations[-1].text)
        # get reward if detected, else reward is -1
        reward = 0
        if state.number_of_rewards_since_last_state > 0:
            reward = state.rewards[0].getValue()
    
        # reformat grid to a vector that only show the floor with blocks
        vec = []    
        for item in self.data['grid'][::-1]:
            if 'lava' in item:
                vec.append(1)
            elif 'lapis' in item:
                vec.append(2)
            else:
                vec.append(0)
        
        
        #calculate distance to target and assign reward
        new_state = np.array(vec)
        
        #if self.world_state.is_mission_running is False and new_state.sum() != 0.1:
        #    return 0,0,0,self.world_state
        
        tmp = np.array(vec).reshape(self.grid,self.grid)
        center = int(np.floor(self.grid/2))
        cntr_idx = (center,center)
                     
        target_dist = self._locate_closest(2,tmp,cntr_idx)
        
        fire_dist = self._locate_closest(1,tmp,cntr_idx)
        
        lnd_dist = self._locate_closest(0,tmp,cntr_idx)
        
        tmp[cntr_idx] = 5
            
        proto_state = [0,0,0]
        reward+=0.01
        
        if target_dist:
            reward += (1/(1+target_dist))*1000
            proto_state[0] = target_dist
        if fire_dist:
            reward += (1/(1+fire_dist))*-100
            proto_state[1] = fire_dist
        if lnd_dist:
            #reward += (1/(1+lnd_dist))
            proto_state[2] = lnd_dist
        return(reward,np.array(proto_state),self.data,state) # return r,s,data,extra_info
        
    def startworld(self,world_file):
        with open(world_file,'r') as f:
            my_mission = MalmoPython.MissionSpec(f.read(), True)
        my_mission_record = MalmoPython.MissionRecordSpec()
        # Attempt to start a mission:
        max_retries = 3
        for retry in range(max_retries):
            try:
                self.host.startMission( my_mission, my_mission_record )
                sys.stdout.write("Mission Started")
                break
            except RuntimeError as e:
                if retry == max_retries - 1:
                    print ("Error starting mission:{}".format(e))
                    exit(1)
                else:
                    time.sleep(2)
        # Loop until mission starts:
        #print ("Waiting for the mission to start ")
        self.world_state = self.host.getWorldState()
        while (not self.world_state.has_mission_begun):
            sys.stdout.write(".")
            time.sleep(0.1)
            self.world_state = self.host.getWorldState()
            for error in self.world_state.errors:
                print ("Error:",error.text)
                
        ## wait until a valid observation        
        while self.world_state.is_mission_running and all(e.text=='{}' for e in self.world_state.observations):
            self.world_state = self.host.peekWorldState()
        #populate emtpy fields for init
        self.data = json.loads(self.world_state.observations[-1].text)
        
        return self.observe()
    def quit(self):
        self.host.sendCommand('quit')
        
    def step(self,action):
        self.host.sendCommand(self.actions[action])
        return self.observe()

### Actions

In [43]:
actions = {
    'strafe':{
        'left': 'strafe -1',
        'right': 'strafe 1'
    },
    'move':{
        'back':'move -1',
        'forward':'move 1'
    },
    'pitch':{
        'up':'pitch -0.03',
        'down':'pitch 0.03'
    },
    'turn':{
        'anti':'turn -1',
        'clk':'turn 1'
    },
    'jump':{
        'on':'jump 1',
        'off':'jump 0'
    },
    'attack':{
        'on': 'attack 1',
        'off': 'attack 0'
    },
    'use':{
        'on': 'use 1',
        'off': 'use 0'
    },
    'crouch':{
        'on':'crouch 1',
        'off':'crouch 0'
    }
}

In [44]:
simple_actions = {
    'strafe':{
        'left': 'strafe -1',
        'right': 'strafe 1'
    },
    'move':{
        'back':'move -1',
        'forward':'move 1'
    }   
}
# flatten dict of actions
ractions = []
for action_type in simple_actions.keys():
    
    for action in simple_actions[action_type]:
        ractions.append(simple_actions[action_type][action])

## Test ground

In [45]:
agent_host = MalmoPython.AgentHost()
try:
    agent_host.parse( sys.argv )
except RuntimeError as e:
    print ('ERROR:',e)
    print (agent_host.getUsage())
    exit(1)
if agent_host.receivedArgument("help"):
    print (agent_host.getUsage())
    exit(0)

ERROR: unrecognised option '-f'
Malmo version: 0.31.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test




In [66]:
p = figure(plot_width=400, plot_height=400,title="rewards",
                      x_axis_label="x",
                      y_axis_label="y")
test_plot = p.line([],[],color="firebrick",line_width=2)
# make a grid
test_handle = show(p, notebook_handle=True)

In [67]:
tempenv = Env(None,61
             d)
r,s,data,ws = tempenv.startworld("CliffWalking.xml")
R = 0
for t in itertools.count():
    clear_output(wait=True)
    display(s,r)

    #time.sleep(1)
    R += r
    
    update(t,R,test_plot,test_handle)
    if ws.is_mission_running is False:
        r,s,data,ws = tempenv.startworld("CliffWalking.xml")
        #clear_output(wait=True)
        #display(s,r)
        print('done')
    r,s,data,ws = tempenv.observe()


array([ 11.18033989,   1.        ,   0.        ])

32.109515221765719



KeyboardInterrupt: 

## Run Code

In [62]:
p = figure(plot_width=400, plot_height=400,title="rewards",
                      x_axis_label="x",
                      y_axis_label="y")
rewards_plot = p.line([],[],color="firebrick",line_width=2)
# make a grid
handle = show(p, notebook_handle=True)
figure

<function bokeh.plotting.figure.figure>

In [63]:
def update(x,y,plot,handler):
    plot.data_source.data['x'] += [x]
    plot.data_source.data['y'] += [y]
    push_notebook(handle=handler)

In [64]:
def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        return out

def Train(env,world):
    U.reset()
    with U.make_session(2) as sess:
        replay_buffer = ReplayBuffer(5000)
        exploration = LinearSchedule(schedule_timesteps=1000, initial_p=0.9, final_p=0.5)
        episode_rewards = [0.0]
        
        Kend.set_session(sess)
        
        def new_model(inpt, num_actions, scope, reuse=False):
            with tf.variable_scope(scope, reuse=reuse):
                out = Dense(64,activation='relu')(inpt)
                out = Dense(num_actions,activation='softmax')(out)
                return out
        
        r,s,data,ws = env.startworld(world)
        act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: U.BatchInput((3,), name=name), # prev 49
        q_func=model,
        num_actions= len(env.actions), #prev len(ractions)
        optimizer=tf.train.AdamOptimizer(learning_rate=0.5)
        )
        U.initialize()
        update_target()
        
        R = 0
        episode = 0
        for t in itertools.count():
            # exploration schedule update_eps=exploration.value(t)
            update_eps=exploration.value(t)
            action = act(s[None])[0]

            r,s_,data,ws = env.step(action)

            done = ws.is_mission_running is False
            replay_buffer.add(s,action,r,s_,done)


            s = s_

            episode_rewards[-1] += r
            
            R += r
            #writer = tf.summary.FileWriter("logs", sess.graph)
            if done: #mission is done
                _,s,_,ws = env.startworld('CliffWalking.xml')
                clear_output(wait=True) 
                display("mission done reward : {} @ t = {}".format(episode_rewards[-1],t))
                update(episode,episode_rewards[-1],rewards_plot,handle)
                episode_rewards.append(0)
                episode+=1
                time.sleep(0.5) # give env time to reset
                #s = gym_env.reset()


            is_solved = t > 100 and np.mean(episode_rewards) >= 10000
            
            if t%100 == 0:
                display('reward @ t= {} is {}'.format(t,r))
            
            if is_solved:
                # Show off the result
                whaa = 5+2
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()
                    clear_output(wait=True)             
                    display(r)

                if t % 100 == 0 and t > 5:
                    #display(r)
                    whaa = 5

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()
        #writer.close()
                

In [65]:
env = Env(ractions,61)
Train(env,'CliffWalking.xml')

'mission done reward : -8855.057215672814 @ t = 241'



Mission Started.............

KeyError: 'grid'

In [None]:
env.quit()