In [1]:
import MalmoPython
import os
import sys
import time
import random
import json
import numpy as np
import time
from IPython.display import clear_output,display
import logging
import math
actions = {
    'strafe':{
        'left': 'strafe -1',
        'right': 'strafe 1'
    },
    'move':{
        'back':'move -1',
        'forward':'move 1'
    },
    'pitch':{
        'up':'pitch -0.03',
        'down':'pitch 0.03'
    },
    'turn':{
        'anti':'turn -1',
        'clk':'turn 1'
    },
    'jump':{
        'on':'jump 1',
        'off':'jump 0'
    },
    'attack':{
        'on': 'attack 1',
        'off': 'attack 0'
    },
    'use':{
        'on': 'use 1',
        'off': 'use 0'
    },
    'crouch':{
        'on':'crouch 1',
        'off':'crouch 0'
    }
}
# Create default Malmo objects:

In [2]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, push_notebook, show
from bokeh.driving import linear
from bokeh.layouts import row,gridplot

from IPython.display import clear_output,display
output_notebook()

In [3]:
simple_actions = {
    'strafe':{
        'left': 'strafe -1',
        'right': 'strafe 1'
    },
    'move':{
        'back':'move -1',
        'forward':'move 1'
    }   
}
# flatten dict of actions
ractions = []
for action_type in simple_actions.keys():
    
    for action in simple_actions[action_type]:
        ractions.append(simple_actions[action_type][action])

## Brain

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Activation,GRU,Input,LSTM
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [5]:
class Brain:
    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        # self.model.load_weights("cartpole-basic.h5")

    def _createModel(self):
        model = Sequential()
        
        model.add(Dense(output_dim=64, activation='relu', input_dim=self.stateCnt))
        model.add(Dense(output_dim=self.actionCnt, activation='linear'))
        
        #model.add(GRU(64,input_shape=(None,49),return_sequences=True))
        #model.add(GRU(4))
        
        
        opt = RMSprop(lr=0.00025)
        model.compile(loss='mse', optimizer=opt)

        return model

    def train(self, x, y, epoch=1, verbose=0):
        self.model.fit(x, y, batch_size=64, nb_epoch=epoch, verbose=verbose)

    def predict(self, s):
        return np.array(self.model.predict(s))

    def predictOne(self, s):
        return self.predict(s.reshape(1, self.stateCnt)).flatten()



## Memory

In [6]:
class Memory:   # stored as ( s, a, r, s_ )
    samples = []

    def __init__(self, capacity):
        self.capacity = capacity
        

    def add(self, sample):
        self.samples.append(sample)        

        if len(self.samples) > self.capacity:
            self.samples.pop(0)

    def sample(self, n):
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)

## Agent

In [7]:
MEMORY_CAPACITY = 100000
BATCH_SIZE = 64

GAMMA = 0.99

MAX_EPSILON = 1
MIN_EPSILON = 0.01
LAMBDA = 0.001      # speed of decay

class Agent:
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt,actions):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.actions = actions

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)
        
    def act(self, s):
        # Epsilon greedy action selection
        if random.random() < self.epsilon:
            act_int = random.randint(0, self.actionCnt-1)
        else:
            act_int = np.argmax(self.brain.predictOne(s))
        return self.actions[act_int],act_int

    def observe(self, sample):  # in (s, a, r, s_) format
        self.memory.add(sample)        

        # slowly decrease Epsilon based on our eperience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):    
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)

        no_state = np.zeros(self.stateCnt)
        
        
        
        states = np.array([ o[0] for o in batch ])
        states_ = np.array([ (no_state if o[3] is None else o[3]) for o in batch ])
        
        #print("SHAPE IS {}".format(states.shape))
        
        #np.pad(states,(0,))
        
        
        
        
        p = agent.brain.predict(states)
        p_ = agent.brain.predict(states_)

        x = np.zeros((batchLen, self.stateCnt))
        y = np.zeros((batchLen, self.actionCnt))
        
        for i in range(batchLen):
            o = batch[i]
            s = o[0]
            a = o[1]
            r = o[2]
            s_ = o[3]
            
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * np.amax(p_[i])
            x[i] = s
            y[i] = t
            
            self.brain.train(x, y)

## Environment

In [8]:
class Env:
    def __init__(self,actions,obs_shape = 3):
        self.world_state = None
        self.my_mission_record = MalmoPython.MissionRecordSpec()
        self.data = None
        self.observation_space = np.zeros(shape=(obs_shape**2,))
        
        self.actions = actions
        
        self.host = MalmoPython.AgentHost()
        try:
            self.host.parse( sys.argv )
        except RuntimeError as e:
            print ('ERROR:',e)
            print (self.host.getUsage())
            if self.host.receivedArgument("help"):
                print (self.host.getUsage())
                exit(0)
    def _dist(self,x,y):
        return np.sqrt(np.sum((x-y)**2))
    def observe(self):
        
        self.world_state = self.host.peekWorldState()
        while self.world_state.is_mission_running and all(e.text=='{}' for e in self.world_state.observations):
            self.world_state = self.host.peekWorldState()
   
        # If there are some new observations
        if self.world_state.number_of_observations_since_last_state > 0:
            self.data = json.loads(self.world_state.observations[-1].text)
        
        state = self.world_state
            
        #data = json.loads(state.observations[-1].text)
        # get reward if detected, else reward is -1
        reward = 1
        if state.number_of_rewards_since_last_state > 0:
            reward = state.rewards[0].getValue()
    
        # reformat grid to a vector that only show the floor with blocks
        vec = []    
        for item in self.data['grid'][::-1]:
            if 'lava' in item:
                vec.append(1)
            elif 'lapis' in item:
                vec.append(2)
            else:
                vec.append(0)
        
        
        # compute reward depending on distance to target
        new_state = np.array(vec)
        tmp = np.array(vec).reshape(self.observation_space.shape)
        idx2 = np.argwhere(tmp == 2)
        
        size = self.observation_space.shape[0]
        idx1 = (np.ceil(size/2),np.ceil(size/2))
                     
        a = (self._dist(idx2,idx1))
        if(a > 0):
            dist_reward = 100 - a
            reward += dist_reward
            print("close to objective reward : {}".format(dist_reward))
            
            
        
        
        return(reward,new_state,self.data,state) # return r,s,data,extra_info
        
    def startworld(self,world_file):
        with open(world_file,'r') as f:
            my_mission = MalmoPython.MissionSpec(f.read(), True)
        my_mission_record = MalmoPython.MissionRecordSpec()
        # Attempt to start a mission:
        max_retries = 3
        for retry in range(max_retries):
            try:
                self.host.startMission( my_mission, my_mission_record )
                sys.stdout.write("Mission Started")
                break
            except RuntimeError as e:
                if retry == max_retries - 1:
                    print ("Error starting mission:{}".format(e))
                    exit(1)
                else:
                    time.sleep(2)
        # Loop until mission starts:
        #print ("Waiting for the mission to start ")
        self.world_state = self.host.getWorldState()
        while (not self.world_state.has_mission_begun):
            sys.stdout.write(".")
            time.sleep(0.1)
            self.world_state = self.host.getWorldState()
            for error in self.world_state.errors:
                print ("Error:",error.text)
                
        ## wait until a valid observation        
        while self.world_state.is_mission_running and all(e.text=='{}' for e in self.world_state.observations):
            self.world_state = self.host.peekWorldState()
        #populate emtpy fields for init
        self.data = json.loads(self.world_state.observations[-1].text)
        
        return self.observe()
    def quit(self):
        self.host.sendCommand('quit')
        
    def step(self,action):
        self.host.sendCommand(self.actions[action])
        return self.observe()

## Test for manual code execution

In [None]:

agent_host = MalmoPython.AgentHost()
try:
    agent_host.parse( sys.argv )
except RuntimeError as e:
    print ('ERROR:',e)
    print (agent_host.getUsage())
    exit(1)
if agent_host.receivedArgument("help"):
    print (agent_host.getUsage())
    exit(0)



In [None]:

r,s,data,ws = env.startworld('CliffWalking.xml')


In [13]:
env = Env(ractions,11)
r,s,data,ws = env.startworld('CliffWalking.xml')
done = ws.is_mission_running is False
print(s.shape)
while(not done):
    time.sleep(0.7)
    clear_output(wait=True)
    r,s,data,ws = env.observe()
    display(s.reshape(11,11),s.shape)
    
    done = ws.is_mission_running is False
    
    
    

close to objective reward : 51.91673887931477


array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 0, 2, 0, 1, 1, 1, 0],
       [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0],
       [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0],
       [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0],
       [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0],
       [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0],
       [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0],
       [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0],
       [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0]])

(121,)

## Plots

In [9]:
fig1 = figure(plot_width=400, plot_height=400,title="rewards",
                      x_axis_label="x",
                      y_axis_label="y",sizing_mode='scale_width')
rplot = fig1.line([],[],color="firebrick",line_width=2)
# make a grid
handle1 = show(fig1, notebook_handle=True)

In [10]:
def update(x,y,handle,plot):
    plot.data_source.data['x'] += [x]
    plot.data_source.data['y'] += [y]
    push_notebook(handle=handle)

## Run Code

In [None]:
env.quit()

In [11]:
agent = Agent(11**2,4,ractions)
env = Env(ractions,11)



ERROR: unrecognised option '-f'
Malmo version: 0.31.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test




In [12]:
r,s,data,ws = env.startworld('CliffWalking.xml')
R = [0.0]
for t in range(100000):
    r,s,data,ws = env.startworld('CliffWalking.xml')
    done = ws.is_mission_running is False
    while(not done):
        send_a, a = agent.act(s)
        r,s_,data,ws = env.step(a)
        done = ws.is_mission_running is False
        
        agent.observe((s,a,r,s_))
        agent.replay()
        
        s = s_
        R[-1] += r
    update(t,R[-1],handle1,rplot)
    R.append(0.0)
    
    print('done play through {}'.format(t))
    

Mission Started................Error starting mission:A mission is already running.




done play through 0
Mission Started..................



done play through 1
Mission Started................



done play through 2
Mission Started..................



done play through 3
Mission Started...............



done play through 4
Mission Started...............



done play through 5
Mission Started.................



done play through 6
Mission Started.................



done play through 7
Mission Started................



done play through 8
Mission Started..............



done play through 9
Mission Started.................



done play through 10
Mission Started...............



done play through 11
Mission Started...............



done play through 12
Mission Started.............



done play through 13
Mission Started...................



done play through 14
Mission Started..............



done play through 15
Mission Started............



done play through 16
Mission Started................



done play through 17
Mission Started..............



done play through 18
Mission Started..............



done play through 19
Mission Started...............



done play through 20
Mission Started..................



done play through 21
Mission Started..................



done play through 22
Mission Started................



done play through 23
Mission Started..............



done play through 24
Mission Started...............



done play through 25
Mission Started.................



done play through 26
Mission Started..................



done play through 27
Mission Started................



done play through 28
Mission Started................



done play through 29
Mission Started..................



done play through 30
Mission Started................



done play through 31
Mission Started..................



done play through 32
Mission Started.................



done play through 33
Mission Started.................



done play through 34
Mission Started................



done play through 35
Mission Started...................



.done play through 36
Mission Started.................



done play through 37
Mission Started.....................



done play through 38
Mission Started...............



done play through 39
Mission Started...............



done play through 40
Mission Started...............



done play through 41
Mission Started...............



done play through 42
Mission Started.................



done play through 43
Mission Started................



done play through 44
Mission Started.................



done play through 45
Mission Started................



done play through 46
Mission Started................



done play through 47
Mission Started...............



done play through 48
Mission Started...............



done play through 49
Mission Started...............



done play through 50
Mission Started...............



done play through 51
Mission Started..............



done play through 52
Mission Started...................



done play through 53
Mission Started..................



done play through 54
Mission Started.........

KeyboardInterrupt: 

In [None]:
a = np.array(range(27)).reshape(3,3,3)
a.shape

In [None]:
np.pad(a,[(0,3),(0,0),(0,0)], 'constant')

In [None]:
b = np.array(range(49)).reshape(None,1,49)

In [None]:
b

In [None]:
np.pad(b)