In [1]:
import MalmoPython
import os
import sys
import time
import random
import json
import numpy as numpy
import time
from IPython.display import clear_output,display
import logging
import math
actions = {
    'strafe':{
        'left': 'strafe -1',
        'right': 'strafe 1'
    },
    'move':{
        'back':'move -1',
        'forward':'move 1'
    },
    'pitch':{
        'up':'pitch -0.03',
        'down':'pitch 0.03'
    },
    'turn':{
        'anti':'turn -1',
        'clk':'turn 1'
    },
    'jump':{
        'on':'jump 1',
        'off':'jump 0'
    },
    'attack':{
        'on': 'attack 1',
        'off': 'attack 0'
    },
    'use':{
        'on': 'use 1',
        'off': 'use 0'
    },
    'crouch':{
        'on':'crouch 1',
        'off':'crouch 0'
    }
}
# Create default Malmo objects:

In [2]:
simple_actions = {
    'strafe':{
        'left': 'strafe -1',
        'right': 'strafe 1'
    },
    'move':{
        'back':'move -1',
        'forward':'move 1'
    }   
}
# flatten dict of actions
ractions = []
for action_type in simple_actions.keys():
    
    for action in simple_actions[action_type]:
        ractions.append(simple_actions[action_type][action])


## Brain

In [3]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [4]:
class Brain:
    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        # self.model.load_weights("cartpole-basic.h5")

    def _createModel(self):
        model = Sequential()

        model.add(Dense(output_dim=64, activation='relu', input_dim=self.stateCnt))
        model.add(Dense(output_dim=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=0.00025)
        model.compile(loss='mse', optimizer=opt)

        return model

    def train(self, x, y, epoch=1, verbose=0):
        self.model.fit(x, y, batch_size=64, nb_epoch=epoch, verbose=verbose)

    def predict(self, s):
        return numpy.array(self.model.predict(s))

    def predictOne(self, s):
        return self.predict(s.reshape(1, self.stateCnt)).flatten()


In [5]:
def train(self, x, y):
    model.fit(x, y, batch_size=64)


## Memory

In [6]:
class Memory:   # stored as ( s, a, r, s_ )
    samples = []

    def __init__(self, capacity):
        self.capacity = capacity

    def add(self, sample):
        self.samples.append(sample)        

        if len(self.samples) > self.capacity:
            self.samples.pop(0)

    def sample(self, n):
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)

## Agent

In [7]:
MEMORY_CAPACITY = 100000
BATCH_SIZE = 64

GAMMA = 0.99

MAX_EPSILON = 1
MIN_EPSILON = 0.01
LAMBDA = 0.001      # speed of decay

class Agent:
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt,actions):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.actions = actions

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)
        
    def act(self, s):
        # Epsilon greedy action selection
        if random.random() < self.epsilon:
            act_int = random.randint(0, self.actionCnt-1)
        else:
            act_int = numpy.argmax(self.brain.predictOne(s))
        return self.actions[act_int],act_int

    def observe(self, sample):  # in (s, a, r, s_) format
        self.memory.add(sample)        

        # slowly decrease Epsilon based on our eperience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):    
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)

        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([ o[0] for o in batch ])
        states_ = numpy.array([ (no_state if o[3] is None else o[3]) for o in batch ])

        p = agent.brain.predict(states)
        p_ = agent.brain.predict(states_)

        x = numpy.zeros((batchLen, self.stateCnt))
        y = numpy.zeros((batchLen, self.actionCnt))
        
        for i in range(batchLen):
            o = batch[i]
            s = o[0]
            a = o[1]
            r = o[2]
            s_ = o[3]
            
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * numpy.amax(p_[i])
                

            x[i] = s
            y[i] = t
            
            self.brain.train(x, y)

## Environment

In [8]:
class Env:
    def __init__(self, agent,agent_host):
        self.world_state = None
        self.agent = agent
        self.agent_host = agent_host
        self.my_mission_record = MalmoPython.MissionRecordSpec()
        self.episode_length = 100
        self.data = None
    def parse_state(self):
        #data = json.loads(state.observations[-1].text)
        
        self.world_state = self.agent_host.peekWorldState()
        while self.world_state.is_mission_running and all(e.text=='{}' for e in self.world_state.observations):
            self.world_state = self.agent_host.peekWorldState()
   
        # If there are some new observations
        if self.world_state.number_of_observations_since_last_state > 0:
            print("observed")
            self.data = json.loads(self.world_state.observations[-1].text)
        
        state = self.world_state
            
        #data = json.loads(state.observations[-1].text)
        # get reward if detected, else reward is -1
        reward = -1
        if state.number_of_rewards_since_last_state > 0:
            reward = state.rewards[0].getValue()
    
        # reformat grid to a vector that only show the floor with blocks
        grid = self.data['grid'][:16]
        new_grid = list()
        for i,item in enumerate(grid):
            if 'lava' in item:
                new_grid.append(1)
            else:
                new_grid.append(0)
        #self.data['grid'] = new_grid
        
        return(reward,new_grid,self.data,state)
        
    def startworld(self,world_file):
        
        with open('CliffWalking.xml','r') as f:
            my_mission = MalmoPython.MissionSpec(f.read(), True)
        my_mission_record = MalmoPython.MissionRecordSpec()
        # Attempt to start a mission:
        max_retries = 3
        for retry in range(max_retries):
            try:
                self.agent_host.startMission( my_mission, my_mission_record )
                sys.stdout.write("Mission Started")
                break
            except RuntimeError as e:
                if retry == max_retries - 1:
                    print "Error starting mission:",e
                    exit(1)
                else:
                    time.sleep(2)
        # Loop until mission starts:
        print "Waiting for the mission to start ",
        self.world_state = self.agent_host.getWorldState()
        while (not self.world_state.has_mission_begun):
            sys.stdout.write(".")
            time.sleep(0.1)
            self.world_state = self.agent_host.getWorldState()
            for error in self.world_state.errors:
                print "Error:",error.text
                
        ## wait until a valid observation        
        while self.world_state.is_mission_running and all(e.text=='{}' for e in self.world_state.observations):
            self.world_state = self.agent_host.peekWorldState()
        #populate emtpy fields for init
        self.data = json.loads(self.world_state.observations[-1].text)
        
        return self.parse_state() 
    def run(self,world,epochs=0):
        # load world
        R = 0
        for i in range(epochs):
            _, s, obs,ws = self.startworld(world)
            mission_run = ws.is_mission_running
            while(mission_run):
                # get state
                time.sleep(1)
                
                # act
                send_a,a = agent.act(s)
                print("action:{}".format(send_a))
                self.agent_host.sendCommand(send_a)

                
                # observe
                #print(a)
                r,s_prime,obs,ws = self.parse_state()
                mission_run = ws.is_mission_running
                #print("sample:{}".format(a))
                self.agent.observe((s,a,r,s_prime))
                self.agent.replay()
                
                s = s_prime
                R += r
                if (not  mission_run):
                    print("episode done")
                    time.sleep(1)
                
            print("done epoch: {}".format(i))
                
                
                
            
                
        #loop
        # observe
        
        # take action

## Test for manual code execution

In [9]:

agent_host = MalmoPython.AgentHost()
try:
    agent_host.parse( sys.argv )
except RuntimeError as e:
    print 'ERROR:',e
    print agent_host.getUsage()
    exit(1)
if agent_host.receivedArgument("help"):
    print agent_host.getUsage()
    exit(0)



ERROR: unrecognised option '-f'
Malmo version: 0.31.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test




In [61]:
# load world
with open('CliffWalking.xml','r') as f:
    my_mission = MalmoPython.MissionSpec(f.read(), True)
my_mission_record = MalmoPython.MissionRecordSpec()
 
# Attempt to start a mission:
max_retries = 3

for retry in range(max_retries):
    try:
        agent_host.startMission( my_mission, my_mission_record )
        break
    except RuntimeError as e:
        if retry == max_retries - 1:
            print "Error starting mission:",e
            exit(1)
        else:
            time.sleep(2)

# Loop until mission starts:
print "Waiting for the mission to start ",
world_state = agent_host.getWorldState()

while not world_state.has_mission_begun:
    sys.stdout.write(".")
    time.sleep(0.1)
    world_state = agent_host.getWorldState()
    for error in world_state.errors:
        print "Error:",error.text
 

Waiting for the mission to start .....


In [11]:
test_env = Env(None,agent_host)

"""
while world_state.is_mission_running:
    time.sleep(2)
    world_state = agent_host.getWorldState()
    # print(chose_act)
    clear_output(wait=True)
    display(test_env.parse_state())
    # agent_host.sendCommand(chose_act)
    for error in world_state.errors:
        print "Error:",error.text
"""

test_env.startworld('CliffWalking.xml')


Mission StartedWaiting for the mission to start ..... observed


(-1,
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 {u'Air': 300,
  u'DamageDealt': 0,
  u'DamageTaken': 0,
  u'DistanceTravelled': 2688,
  u'Food': 20,
  u'IsAlive': True,
  u'Life': 20.0,
  u'MobsKilled': 0,
  u'Name': u'Cristina',
  u'Pitch': 0.0,
  u'PlayersKilled': 0,
  u'Score': 0,
  u'TimeAlive': 24819,
  u'TotalTime': 24828,
  u'WorldTime': 6000,
  u'XP': 0,
  u'XPos': 4.5,
  u'YPos': 46.0,
  u'Yaw': 0.0,
  u'ZPos': 1.5,
  u'grid': [u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air',
   u'air']},
 <MalmoPython.WorldState at 0x7f8f419525f0>)

In [42]:
test_env.parse_state()

observed


(-1,
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 {u'Air': 300,
  u'DamageDealt': 0,
  u'DamageTaken': 0,
  u'DistanceTravelled': 2688,
  u'Food': 20,
  u'IsAlive': True,
  u'Life': 20.0,
  u'MobsKilled': 0,
  u'Name': u'Cristina',
  u'Pitch': 0.0,
  u'PlayersKilled': 0,
  u'Score': 0,
  u'TimeAlive': 20737,
  u'TotalTime': 20746,
  u'WorldTime': 6000,
  u'XP': 0,
  u'XPos': 4.5,
  u'YPos': 46.0,
  u'Yaw': 0.0,
  u'ZPos': 1.5,
  u'grid': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]},
 <MalmoPython.WorldState at 0x7f49eeef3320>)

In [None]:
agent_host.sendCommand('quit')

In [54]:
a = numpy.ndarray([1,2,3])
numpy.amax(a)

2.4703282292062327e-323

In [77]:
agent_host.getWorldState().rewards[0].getValue()

-101.0

In [36]:
agent = Agent(16,len(ractions),ractions)



In [37]:
env = Env(agent,agent_host)

In [38]:
env.run("CliffWalking.xml",100)

Mission StartedWaiting for the mission to start ..... observed
action:move -1
observed


ValueError: Error when checking : expected dense_9_input to have shape (None, 16) but got array with shape (60, 1)