# Libraries

In [1]:
import pdb;
import scipy.misc as scimisc

from tkinter import *
from PIL import Image
from PIL import ImageTk

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.animation as animation
from PIL import Image

import MalmoPython
import os
import sys
import time
import random
import json
import numpy as np
import time
from IPython.display import clear_output,display
import logging
import math

from bokeh.plotting import figure
from bokeh.io import output_notebook, push_notebook, show
from bokeh.driving import linear
from bokeh.layouts import row,gridplot
from IPython.display import clear_output,display
output_notebook()

In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 18370373501150493388
]


# DQN

Here we are essentially defining the neural model to be used as the function approximator for the Q function

## Brain

In [3]:
from keras.models import Sequential,model_from_json
from keras.layers import Dense, Activation,GRU,Input,LSTM,Conv2D,Flatten
from keras.optimizers import RMSprop
from keras.callbacks import TensorBoard


Using TensorFlow backend.


In [17]:
class Brain:
    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.callback = TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=32, write_graph=True, write_grads=True, write_images=True, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)
        self.model = self._createModel()
        # self.model.load_weights("cartpole-basic.h5")
    def save(self):
        model_json = self.model.to_json()
        with open('./models/model.json','w') as json_file:
            json_file.write(model_json)
            
        self.model.save_weights('./models/CNNmodel.h5')
    def load(self):
        self.model.load_weights('./models/CNNmodel.h5')
    def _createModel(self):
        model = Sequential()
        # Lets try a CNN to take screen as input
        # batch size is 64, 320x240 video RGB channels with an extra channel for depth
        frame_width = self.stateCnt[1]
        frame_height = self.stateCnt[0]
        model.add(Conv2D(32,(8,8),input_shape=(frame_height,frame_width,1),activation='relu'))
        model.add(Conv2D(64,(4,4),activation='relu'))
        model.add(Conv2D(64,(3,3),activation='relu'))
        model.add(Flatten())
        model.add(Dense(60,activation='relu'))
        model.add(Dense(32,activation='relu'))
        model.add(Dense(output_dim=self.actionCnt,activation='softmax'))
        
        opt = RMSprop(lr=0.00025)
        model.compile(loss='mse', optimizer=opt)

        return model

    def train(self, x, y, epoch=1, verbose=0):
        
        self.model.fit(x, y, batch_size=64, epochs=epoch, verbose=verbose)

    def predict(self, s):
        return np.array(self.model.predict(s))

    def predictOne(self, s):
        return self.predict(s.reshape(1, *self.stateCnt)).flatten()


## Memory

In [11]:
class Memory:   # stored as ( s, a, r, s_ )
    def __init__(self, capacity):
        self.samples = []
        self.capacity = capacity
        

    def add(self, sample):
        self.samples.append(sample)        

        if len(self.samples) > self.capacity:
            self.samples.pop(0)

    def sample(self, n):
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)

## Agent

In [64]:
MEMORY_CAPACITY = 50000
BATCH_SIZE = 64

GAMMA = 0.95

MAX_EPSILON = 1
MIN_EPSILON = 0.01
LAMBDA = 0.0001      # speed of decay

class Agent:
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt,actions):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.actions = actions

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)
        
    def act(self, s):
        # Epsilon greedy action selection
        if random.random() < self.epsilon:
            act_int = random.randint(0, self.actionCnt-1)
        else:
            act_int = np.argmax(self.brain.predictOne(s))
        return self.actions[act_int],act_int

    def observe(self, sample):  # in (s, a, r, s_) format
        self.memory.add(sample)        

        # slowly decrease Epsilon based on our eperience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):    
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)

        no_state = np.zeros(self.stateCnt)
         
        states = np.array([ o[0] for o in batch ])
        states_ = np.array([ (no_state if o[3] is None else o[3]) for o in batch ])
        
        p = agent.brain.predict(states)
        p_ = agent.brain.predict(states_)

        x = np.zeros((batchLen, *self.stateCnt))
        y = np.zeros((batchLen, self.actionCnt))
        
        for i in range(batchLen):
            o = batch[i]
            s = o[0]
            a = o[1]
            r = o[2]
            s_ = o[3]
            
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * np.amax(p_[i])
            x[i] = s
            y[i] = t
            #print('before training')
            self.brain.train(x, y)
            #print('trained')

# Environment

In [49]:
class Env:
    def __init__(self,actions,obs_shape,grid_shape,scale = 1/12):
        self.world_state = None
        self.my_mission_record = MalmoPython.MissionRecordSpec()
        self.data = None
        #self.observation_space = np.zeros(shape=(obs_shape**2,))
        self.obs_shape = obs_shape
        self.actions = actions
        self.scale = scale
        self.grid_shape = grid_shape
        self.host = MalmoPython.AgentHost()
        self.obs = None
        try:
            self.host.parse( sys.argv )
        except RuntimeError as e:
            print ('ERROR:',e)
            print (self.host.getUsage())
            if self.host.receivedArgument("help"):
                print (self.host.getUsage())
                exit(0)
    def _dist(self,x,y):
        return np.sqrt(np.sum((x-y)**2))
    
    def waitForInitialState( self ):
        '''Before a command has been sent we wait for an observation of the world and a frame.'''
        # wait for a valid observation
        world_state = self.host.peekWorldState()
        while world_state.is_mission_running and all(e.text=='{}' for e in world_state.observations):
            world_state = self.host.peekWorldState()
        # wait for a frame to arrive after that
        num_frames_seen = world_state.number_of_video_frames_since_last_state
        while world_state.is_mission_running and world_state.number_of_video_frames_since_last_state == num_frames_seen:
            world_state = self.host.peekWorldState()
        world_state = self.host.getWorldState()

        reward = 0
        smaller = None
        data = None
        if world_state.is_mission_running:
                
            assert len(world_state.video_frames) > 0, 'No video frames!?'
            
            obs = json.loads( world_state.observations[-1].text )
            frame = world_state.video_frames[-1]
            reward,smaller,data,_ = self.process(world_state)
            return reward,smaller,data,world_state
        else:
            return None
    def waitForNextState( self ):
        '''After each command has been sent we wait for the observation to change as expected and a frame.'''
        # wait for the observation position to have changed
        while True:
            world_state = self.host.peekWorldState()
            if not world_state.is_mission_running:
                print('mission ended.')
                break
            if not all(e.text=='{}' for e in world_state.observations):
                obs = json.loads( world_state.observations[-1].text )
                break
        # wait for the render position to have changed
        while True:
            world_state = self.host.peekWorldState()
            if len(world_state.video_frames) > 0:
                frame = world_state.video_frames[-1]
                break
            if not world_state.is_mission_running:
                break

        reward = 0
        smaller = None
        data = None
        num_frames_before_get = len(world_state.video_frames)
        world_state = self.host.getWorldState()

        if world_state.is_mission_running:
            assert len(world_state.video_frames) > 0, 'No video frames!?'
            num_frames_after_get = len(world_state.video_frames)
            assert num_frames_after_get >= num_frames_before_get, 'Fewer frames after getWorldState!?'
            frame = world_state.video_frames[-1]
            reward,smaller,data,_ = self.process(world_state)
        return reward,smaller,data,world_state
    def process(self,world_state):
        obs = json.loads( world_state.observations[-1].text )
        reward = 1
        if world_state.number_of_rewards_since_last_state > 0:
            reward = world_state.rewards[0].getValue()
        


        
        # reformat grid to a vector that only show the floor with blocks
        vec = []    
        for item in obs['grid'][::-1]:
            if 'lava' in item:
                vec.append(1)
            elif 'lapis' in item:
                vec.append(2)
            else:
                vec.append(0)

        # read frame into numpy array (height,width,color(RGB))
        frame = np.array(world_state.video_frames[-1].pixels).reshape(self.obs_shape)
        
        # grayscale
        gray_frame = np.dot(frame[...,:3],[0.299,0.587,0.114]).reshape((self.obs_shape[0],self.obs_shape[1]))
        
        # scale down
        # scimisc will output an array of type uint8 for further preprocessing the data needs to be casted to float
        smaller = scimisc.imresize(gray_frame,1/12,mode='L').astype('float64')
        smaller = np.expand_dims(smaller,2)
        
        # compute reward depending on distance to target
        new_state = np.array(vec)
        tmp = np.array(vec).reshape(self.grid_shape)
        idx2 = np.argwhere(tmp == 2)
        
        size = self.grid_shape[0]
        idx1 = (np.ceil(size/2),np.ceil(size/2))
                     
        a = (self._dist(idx2,idx1))
        if(a > 0):
            try:
                dist_reward = 2000/(a)
            except:
                dist_reward = 0
            reward += dist_reward
            #print("close to objective reward : {}".format(dist_reward))
        return(reward,smaller,self.data,world_state) # return r,s,data,extra_info
    
    def observe(self,init=False):
        if( init ):
            tmp = self.waitForInitialState()
            while(tmp == None):
                tmp = self.waitForInitialState()
            return tmp
        else:
            return self.waitForNextState()
        # wait for the observation position to have changed
        '''
        while True:
            world_state = self.host.peekWorldState()

            if not all(e.text=='{}' for e in world_state.observations):
                self.obs = json.loads( world_state.observations[-1].text )
                break
            if not world_state.is_mission_running:
                break
                
        
                # wait for the render position to have changed
        while True:
            world_state = self.host.peekWorldState()
            if not world_state.is_mission_running:
                break
            if len(world_state.video_frames) > 0:
                break
        num_frames_before_get = len(world_state.video_frames)
        world_state = self.host.getWorldState()
        
        if world_state.is_mission_running:
            assert len(world_state.video_frames) > 0, 'No video frames!?'
            num_frames_after_get = len(world_state.video_frames)
            assert num_frames_after_get >= num_frames_before_get, 'Fewer frames after getWorldState!?'
            frame = world_state.video_frames[-1]
            self.obs = json.loads( world_state.observations[-1].text )
        
        self.obs = json.loads( world_state.observations[-1].text )
        reward = 1
        if world_state.number_of_rewards_since_last_state > 0:
            reward = world_state.rewards[0].getValue()
        


        
        # reformat grid to a vector that only show the floor with blocks
        vec = []    
        for item in self.obs['grid'][::-1]:
            if 'lava' in item:
                vec.append(1)
            elif 'lapis' in item:
                vec.append(2)
            else:
                vec.append(0)

        # read frame into numpy array (height,width,color(RGB))
        frame = np.array(world_state.video_frames[-1].pixels).reshape(self.obs_shape)
        
        # grayscale
        gray_frame = np.dot(frame[...,:3],[0.299,0.587,0.114]).reshape((self.obs_shape[0],self.obs_shape[1]))
        
        # scale down
        smaller = scimisc.imresize(gray_frame,1/12,mode='L')
        smaller = np.expand_dims(smaller,2)
    
        # compute reward depending on distance to target
        new_state = np.array(vec)
        tmp = np.array(vec).reshape(self.grid_shape)
        idx2 = np.argwhere(tmp == 2)
        
        size = self.grid_shape[0]
        idx1 = (np.ceil(size/2),np.ceil(size/2))
                     
        a = (self._dist(idx2,idx1))
        if(a > 0):
            dist_reward = 2000 - a
            reward += dist_reward
            #print("close to objective reward : {}".format(dist_reward))
            
    
        
        
        return(reward,smaller,self.data,world_state) # return r,s,data,extra_info
        '''
    def startworld(self,world_file):
        with open(world_file,'r') as f:
            my_mission = MalmoPython.MissionSpec(f.read(), True)
        my_mission_record = MalmoPython.MissionRecordSpec()
        # Attempt to start a mission:
        max_retries = 3
        for retry in range(max_retries):
            try:
                self.host.startMission( my_mission, my_mission_record )
                sys.stdout.write("Mission Started")
                break
            except RuntimeError as e:
                if retry == max_retries - 1:
                    print ("Error starting mission:{}".format(e))
                    exit(1)
                else:
                    time.sleep(2)
        # Loop until mission starts:
        #print ("Waiting for the mission to start ")
        self.world_state = self.host.getWorldState()
        while (not self.world_state.has_mission_begun):
            sys.stdout.write(".")
            time.sleep(0.1)
            self.world_state = self.host.getWorldState()
            for error in self.world_state.errors:
                print ("Error:",error.text)
                
        ## wait until a valid observation        
        while self.world_state.is_mission_running and all(e.text=='{}' for e in self.world_state.observations):
            self.world_state = self.host.peekWorldState()
        #populate emtpy fields for init
        self.data = json.loads(self.world_state.observations[-1].text)
        
        return self.observe(True)
    def quit(self):
        self.host.sendCommand('quit')
        
    def step(self,action):
        self.host.sendCommand(self.actions[action])
        return self.observe()

# Plots

In [4]:
def update(x,y,handle,plot):
    plot.data_source.data['x'] += [x]
    plot.data_source.data['y'] += [y]
    push_notebook(handle=handle)

In [5]:
fig1 = figure(plot_width=400, plot_height=400,title="rewards",
                      x_axis_label="x",
                      y_axis_label="y")
rplot = fig1.line([],[],color="firebrick",line_width=2)
# make a grid
handle1 = show(fig1, notebook_handle=True)

# Test

In [62]:
fig_test = figure(plot_width=1000, plot_height=400,title="rewards_test",
                      x_axis_label="x",
                      y_axis_label="y")
test_plot = fig_test.line([],[],color="navy",line_width=2)
# make a grid
handle_test = show(fig_test, notebook_handle=True)

In [61]:
#agent = Agent((25,35,1),4,ractions)
env = Env(ractions,(300,420,3),(41,41))
r,s,data,ws = env.startworld('CliffWalking.xml')
done = ws.is_mission_running is False
root = Tk()
root_frame = Frame(root)
canvas = Canvas(root_frame, borderwidth=0, highlightthickness=0, width=200, height=130, bg="black" )
root_frame.pack()
canvas.pack()

frame_width = 25
frame_height = 35

count = 0
while(not done):
    try:
        r,s,data,ws = env.observe()
        count += 1
        #image = Image.frombytes('RGB', (frame.width,frame.height), bytes(frame.pixels) )


        #pre process
        #frame = s.reshape(frame_height,frame_width)
        #smaller = scimisc.imresize(fe,0.25,mode='L')
        #s = np.squeeze(s,2)
        s = np.squeeze(s,2)
        image = Image.fromarray(s.astype('int8'),mode='L')
        photo = ImageTk.PhotoImage(image)

        root.one = photo
        canvas.delete("all")
        canvas.create_image(frame_width,frame_height, image=photo)
        done = ws.is_mission_running is False
        update(count,r,handle_test,test_plot)
        root.update()
    except:
        root.destroy()

ERROR: unrecognised option '-f'
Malmo version: 0.31.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test


Mission Started.......























mission ended.
mission ended.


TclError: can't invoke "destroy" command: application has been destroyed

# Execution

In [6]:
from hyperdash import Experiment

In [63]:
simple_actions = {
    'strafe':{
        'left': 'strafe -0.5',
        'right': 'strafe 0.5'
    },
    'move':{
        'back':'move -0.5',
        'forward':'move 0.5'
    }   
}
# flatten dict of actions
ractions = []
for action_type in simple_actions.keys():
    
    for action in simple_actions[action_type]:
        ractions.append(simple_actions[action_type][action])

In [36]:
agent = Agent((25,35,1),4,ractions)
env = Env(ractions,(300,420,3),(41,41))

ERROR: unrecognised option '-f'
Malmo version: 0.31.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test






In [None]:
#r,s,data,ws = env.startworld('CliffWalking.xml')

exp = Experiment('First Run')
  
gamma = exp.param("gamma", GAMMA)
memory = exp.param("memory", MEMORY_CAPACITY)
batch_size = exp.param("batch_size", BATCH_SIZE)
img_size = exp.param("image size", "25,35,1")
R = [0.0]

for t in range(10000):
    try:
        r,s,data,ws = env.startworld('CliffWalking.xml')
        done = ws.is_mission_running is False
        while(not done):
            send_a, a = agent.act(s)
            r,s_,data,ws = env.step(a)
            
            done = ws.is_mission_running is False
            if (not done):
                #normalize data
                s /= 255
                s_ /= 255

                #observe (agent)
                agent.observe((s,a,r,s_))

                #think
                agent.replay()
            s = s_
            R[-1] += r
            
        exp.metric('reward',R[-1])
        R.append(0.0)

        print('done play through {}'.format(t))
    except:
        print('END')
        exp.end()
        raise
    

{ gamma: 0.95 }
{ memory: 50000 }
{ batch_size: 64 }
{ image size: 25,35,1 }
Mission Started.....mission ended.
| reward: 167.090960 |
done play through 0
Mission Started.......mission ended.
| reward: -332.909040 |
done play through 1
Mission Started.......mission ended.
| reward: 167.090960 |
done play through 2
Mission Started.......mission ended.
| reward: -532.909040 |
done play through 3
Mission Started.......mission ended.
| reward: 167.090960 |
done play through 4
Mission Started.......mission ended.
| reward: 167.090960 |
done play through 5
Mission Started........mission ended.
| reward: -132.909040 |
done play through 6
Mission Started.......mission ended.
| reward: 167.090960 |
done play through 7
Mission Started........mission ended.
| reward: 167.090960 |
done play through 8
Mission Started.......

In [None]:
#clean up
exp.end()