In [1]:
import MalmoPython
import os
import sys
import time
import random
import json
import numpy as np
from IPython.display import clear_output,display
actions = {
    'strafe':{
        'left': 'strafe -1',
        'right': 'strafe 1'
    },
    'move':{
        'back':'move -1',
        'forward':'move 1'
    },
    'pitch':{
        'up':'pitch -0.03',
        'down':'pitch 0.03'
    },
    'turn':{
        'anti':'turn -1',
        'clk':'turn 1'
    },
    'jump':{
        'on':'jump 1',
        'off':'jump 0'
    },
    'attack':{
        'on': 'attack 1',
        'off': 'attack 0'
    },
    'use':{
        'on': 'use 1',
        'off': 'use 0'
    },
    'crouch':{
        'on':'crouch 1',
        'off':'crouch 0'
    }
}
# Create default Malmo objects:

In [2]:
simple_actions = {
    'strafe':{
        'left': 'strafe -1',
        'right': 'strafe 1'
    },
    'move':{
        'back':'move -1',
        'forward':'move 1'
    }   
}
# flatten dict of actions
ractions = []
for action_type in simple_actions.keys():
    
    for action in simple_actions[action_type]:
        ractions.append(simple_actions[action_type][action])


In [3]:

agent_host = MalmoPython.AgentHost()
try:
    agent_host.parse( sys.argv )
except RuntimeError as e:
    print 'ERROR:',e
    print agent_host.getUsage()
    exit(1)
if agent_host.receivedArgument("help"):
    print agent_host.getUsage()
    exit(0)



ERROR: unrecognised option '-f'
Malmo version: 0.31.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test




## Brain

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [16]:
class Brain:
    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        # self.model.load_weights("cartpole-basic.h5")

    def _createModel(self):
        model = Sequential()

        model.add(Dense(output_dim=64, activation='relu', input_dim=self.stateCnt))
        model.add(Dense(output_dim=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=0.00025)
        model.compile(loss='mse', optimizer=opt)

        return model

    def train(self, x, y, epoch=1, verbose=0):
        self.model.fit(x, y, batch_size=64, nb_epoch=epoch, verbose=verbose)

    def predict(self, s):
        return self.model.predict(s)

    def predictOne(self, s):
        return self.predict(s.reshape(1, self.stateCnt)).flatten()


In [17]:
def train(self, x, y):
    model.fit(x, y, batch_size=64)


## Memory

In [18]:
class Memory:   # stored as ( s, a, r, s_ )
    samples = []

    def __init__(self, capacity):
        self.capacity = capacity

    def add(self, sample):
        self.samples.append(sample)        

        if len(self.samples) > self.capacity:
            self.samples.pop(0)

    def sample(self, n):
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)

## Agent

In [19]:
MEMORY_CAPACITY = 100000
BATCH_SIZE = 64

GAMMA = 0.99

MAX_EPSILON = 1
MIN_EPSILON = 0.01
LAMBDA = 0.001      # speed of decay

class Agent:
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt,actions):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.actions = actions

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)
        
    def act(self, s):
        # Epsilon greedy action selection
        if random.random() < self.epsilon:
            act_int = random.randint(0, self.actionCnt-1)
        else:
            act_int = numpy.argmax(self.brain.predictOne(s))
        return self.actions[act_int]

    def observe(self, sample):  # in (s, a, r, s_) format
        self.memory.add(sample)        

        # slowly decrease Epsilon based on our eperience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):    
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)

        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([ o[0] for o in batch ])
        states_ = numpy.array([ (no_state if o[3] is None else o[3]) for o in batch ])

        p = agent.brain.predict(states)
        p_ = agent.brain.predict(states_)

        x = numpy.zeros((batchLen, self.stateCnt))
        y = numpy.zeros((batchLen, self.actionCnt))
        
        for i in range(batchLen):
            o = batch[i]
            s = o[0]; a = o[1]; r = o[2]; s_ = o[3]
            
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * numpy.amax(p_[i])

            x[i] = s
            y[i] = t
            
            self.brain.train(x, y)

## Environment

In [42]:
class Env:
    def __init__(self, agent,agent_host):
        self.world_state = None
        self.agent = agent
        self.agent_host = agent_host
        self.my_mission_record = MalmoPython.MissionRecordSpec()
        self.episode_length = 100
    def parse_state(self,state):
        data = json.loads(state.observations[-1].text)
        
        # get reward if detected, else reward is -1
        reward = -1
        if state.number_of_rewards_since_last_state > 0:
            reward = state.rewards[0].getValue()
    
        # reformat grid to a vector that only show the floor with blocks
        grid = data['grid'][:9]
        new_grid = list()
        for i,item in enumerate(grid):
            if item == 'lava':
                new_grid.append(1)
            else:
                new_grid.append(0)
        data['grid'] = new_grid
        
        return(reward,new_grid)
        
    def startworld(self,world_file):
        # load world
        with open('CliffWalking.xml','r') as f:
            my_mission = MalmoPython.MissionSpec(f.read(),True)
           
        # Attempt to start a mission:
        max_retries = 3
        for retry in range(max_retries):
            try:
                self.agent_host.startMission( my_mission, self.my_mission_record )
                self.world_state = agent_host.getWorldState()
                return self.world_state
            except RuntimeError as e:
                if retry == max_retries - 1:
                    print "Error starting mission:",
                    self.world_state = agent_host.getWorldState()
                    
    def run(self,world,epochs=0):
        # load world
        R = 0
        for i in range(epochs):
            observation = self.startworld(world)
            _, s = self.parse_state(observation)
            done = !self.observation.is_mission running
            while(done):
                # get state
                
                a = agent.act(s)
                
                
                print("action:{}".format(a))
                self.agent_host.sendCommand(a)
                
                observation = self.agent_host.getWorldState()
                r, s_prime = self.parse_state(observation)
                done = !self.observation.is_mission running
                
                self.agent.observe(s,a,r,s_prime)
                self.agent.replay()
                
                s = s_prime
                R += r
                
            print("done epoch: {}".format(i))
                
                
                
            
                
        #loop
        # observe
        
        # take action

In [138]:
## helper function for sanity

def observe(world_state):
    ## this gonna add overhead :(
    data = json.loads(world_state.observations[-1].text)
    
    
    # get reward if detected, else reward is -1
    reward = -1
    if world_state.number_of_rewards_since_last_state > 0:
        reward = world_state.rewards[0].getValue()
        
    # reformat grid to a vector that only show the floor with blocks
    grid = data['grid'][:9]
    new_grid = list()
    for i,item in enumerate(grid):
        if item == 'lava':
            new_grid.append(1)
        else:
            new_grid.append(0)
            
    data['grid'] = new_grid
    
    
    return (world_state.observations[-1].timestamp, data, reward)

def act(state,action,agent):
    agent.sendCommand(action)
    _, data, reward = observe(state)
    
    return (state,action,reward,data['grid'])

In [136]:
while world_state.is_mission_running:
    time.sleep(2)
    world_state = agent_host.getWorldState()
    # print(chose_act)
    clear_output(wait=True)
    display(observe(world_state))
    # agent_host.sendCommand(chose_act)
    for error in world_state.errors:
        print "Error:",error.text

(datetime.datetime(2017, 10, 10, 19, 1, 29, 36442),
 {u'Air': 300,
  u'DamageDealt': 0,
  u'DamageTaken': 1660,
  u'DistanceTravelled': 10093,
  u'Food': 20,
  u'IsAlive': True,
  u'Life': 20.0,
  u'MobsKilled': 0,
  u'Name': u'Cristina',
  u'Pitch': 0.0,
  u'PlayersKilled': 0,
  u'Score': 0,
  u'TimeAlive': 6496,
  u'TotalTime': 342607,
  u'WorldTime': 6000,
  u'XP': 0,
  u'XPos': 4.5,
  u'YPos': 46.0,
  u'Yaw': 0.0,
  u'ZPos': 1.5,
  u'grid': [1, 1, 1, 0, 0, 1, 0, 1, 1]},
 -1)

KeyboardInterrupt: 

In [122]:
agent_host.sendCommand('quit')

In [96]:
world_state.rewards[-1]

IndexError: Index out of range

In [102]:
agent_host.getWorldState().rewards[0].getValue()

IndexError: Index out of range

In [21]:
agent = Agent(1000,len(ractions),ractions)




In [43]:
env = Env(agent,agent_host)

In [44]:
env.run("CliffWalking.xml",1)

IndexError: Index out of range