# DQN from openAI

## Libs

In [1]:
import MalmoPython
import os
import sys
import time
import random
import json
import itertools
import math

from baselines import deepq
from baselines import logger
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule


import tensorflow as tf
import tensorflow.contrib.layers as layers
import baselines.common.tf_util as U


from IPython.display import clear_output,display
import numpy as np

from keras import backend as Kend
from keras.layers import GRU,Dense,Activation,Input,LSTM
from keras.models import Sequential

from bokeh.plotting import figure
from bokeh.io import output_notebook, push_notebook, show
from bokeh.driving import linear
from bokeh.layouts import row,gridplot
output_notebook()

Using TensorFlow backend.


In [17]:
import pdb;
import scipy.misc as scimisc

from tkinter import *
from PIL import Image
from PIL import ImageTk

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.animation as animation
from PIL import Image

In [19]:
from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div

In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2110787999399032106
, name: "/gpu:0"
device_type: "GPU"
memory_limit: 11324823962
locality {
  bus_id: 1
}
incarnation: 18147453029250201654
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0"
]


## Environment

In [3]:
class Env:
    def __init__(self,actions,obs_shape,grid_shape,scale = 1/12):
        self.world_state = None
        self.my_mission_record = MalmoPython.MissionRecordSpec()
        self.data = None
        #self.observation_space = np.zeros(shape=(obs_shape**2,))
        self.obs_shape = obs_shape
        

        self.actions = actions
        self.scale = scale
        self.grid_shape = grid_shape
        self.host = MalmoPython.AgentHost()
        self.obs = None
        try:
            self.host.parse( sys.argv )
        except RuntimeError as e:
            print ('ERROR:',e)
            print (self.host.getUsage())
            if self.host.receivedArgument("help"):
                print (self.host.getUsage())
                exit(0)
    def _dist(self,x,y):
        return np.sqrt(np.sum((x-y)**2))
    
    def waitForInitialState( self ):
        '''Before a command has been sent we wait for an observation of the world and a frame.'''
        # wait for a valid observation
        world_state = self.host.peekWorldState()
        while world_state.is_mission_running and all(e.text=='{}' for e in world_state.observations):
            world_state = self.host.peekWorldState()
        # wait for a frame to arrive after that
        num_frames_seen = world_state.number_of_video_frames_since_last_state
        while world_state.is_mission_running and world_state.number_of_video_frames_since_last_state == num_frames_seen:
            world_state = self.host.peekWorldState()
        world_state = self.host.getWorldState()

        reward = 0
        smaller = None
        data = None
        if world_state.is_mission_running:
                
            assert len(world_state.video_frames) > 0, 'No video frames!?'
            
            obs = json.loads( world_state.observations[-1].text )
            frame = world_state.video_frames[-1]
            reward,smaller,data,_ = self.process(world_state)
            return reward,smaller,data,world_state
        else:
            return None
    def waitForNextState( self ):
        '''After each command has been sent we wait for the observation to change as expected and a frame.'''
        # wait for the observation position to have changed
        while True:
            world_state = self.host.peekWorldState()
            if not world_state.is_mission_running:
                print('mission ended.')
                break
            if not all(e.text=='{}' for e in world_state.observations):
                obs = json.loads( world_state.observations[-1].text )
                break
        # wait for the render position to have changed
        while True:
            world_state = self.host.peekWorldState()
            if len(world_state.video_frames) > 0:
                frame = world_state.video_frames[-1]
                break
            if not world_state.is_mission_running:
                break

        reward = 0
        smaller = None
        data = None
        num_frames_before_get = len(world_state.video_frames)
        world_state = self.host.getWorldState()

        if world_state.is_mission_running:
            assert len(world_state.video_frames) > 0, 'No video frames!?'
            num_frames_after_get = len(world_state.video_frames)
            assert num_frames_after_get >= num_frames_before_get, 'Fewer frames after getWorldState!?'
            frame = world_state.video_frames[-1]
            reward,smaller,data,_ = self.process(world_state)
        return reward,smaller,data,world_state
    def process(self,world_state):
        obs = json.loads( world_state.observations[-1].text )
        reward = 1
        if world_state.number_of_rewards_since_last_state > 0:
            reward = world_state.rewards[0].getValue()
        


        
        # reformat grid to a vector that only show the floor with blocks
        vec = []    
        for item in obs['grid'][::-1]:
            if 'lava' in item:
                vec.append(1)
            elif 'lapis' in item:
                vec.append(2)
            else:
                vec.append(0)

        # read frame into numpy array (height,width,color(RGB))
        frame = np.array(world_state.video_frames[-1].pixels).reshape(self.obs_shape)
        
        # grayscale
        gray_frame = np.dot(frame[...,:3],[0.299,0.587,0.114]).reshape((self.obs_shape[0],self.obs_shape[1]))
        
        # scale down
        # scimisc will output an array of type uint8 for further preprocessing the data needs to be casted to float
        smaller = scimisc.imresize(gray_frame,1/12,mode='L').astype('float64')
        smaller = np.expand_dims(smaller,2)
        
        # compute reward depending on distance to target
        new_state = np.array(vec)
        tmp = np.array(vec).reshape(self.grid_shape)
        idx2 = np.argwhere(tmp == 2)
        
        size = self.grid_shape[0]
        idx1 = (np.ceil(size/2),np.ceil(size/2))
                     
        a = (self._dist(idx2,idx1))
        if(a > 0):
            try:
                dist_reward = 2000/(a)
            except:
                dist_reward = 0
            reward += dist_reward
            #print("close to objective reward : {}".format(dist_reward))
        return(reward,smaller,self.data,world_state) # return r,s,data,extra_info
    
    def observe(self,init=False):
        if( init ):
            tmp = self.waitForInitialState()
            while(tmp == None):
                tmp = self.waitForInitialState()
            return tmp
        else:
            return self.waitForNextState()
        # wait for the observation position to have changed
        '''
        while True:
            world_state = self.host.peekWorldState()

            if not all(e.text=='{}' for e in world_state.observations):
                self.obs = json.loads( world_state.observations[-1].text )
                break
            if not world_state.is_mission_running:
                break
                
        
                # wait for the render position to have changed
        while True:
            world_state = self.host.peekWorldState()
            if not world_state.is_mission_running:
                break
            if len(world_state.video_frames) > 0:
                break
        num_frames_before_get = len(world_state.video_frames)
        world_state = self.host.getWorldState()
        
        if world_state.is_mission_running:
            assert len(world_state.video_frames) > 0, 'No video frames!?'
            num_frames_after_get = len(world_state.video_frames)
            assert num_frames_after_get >= num_frames_before_get, 'Fewer frames after getWorldState!?'
            frame = world_state.video_frames[-1]
            self.obs = json.loads( world_state.observations[-1].text )
        
        self.obs = json.loads( world_state.observations[-1].text )
        reward = 1
        if world_state.number_of_rewards_since_last_state > 0:
            reward = world_state.rewards[0].getValue()
        


        
        # reformat grid to a vector that only show the floor with blocks
        vec = []    
        for item in self.obs['grid'][::-1]:
            if 'lava' in item:
                vec.append(1)
            elif 'lapis' in item:
                vec.append(2)
            else:
                vec.append(0)

        # read frame into numpy array (height,width,color(RGB))
        frame = np.array(world_state.video_frames[-1].pixels).reshape(self.obs_shape)
        
        # grayscale
        gray_frame = np.dot(frame[...,:3],[0.299,0.587,0.114]).reshape((self.obs_shape[0],self.obs_shape[1]))
        
        # scale down
        smaller = scimisc.imresize(gray_frame,1/12,mode='L')
        smaller = np.expand_dims(smaller,2)
    
        # compute reward depending on distance to target
        new_state = np.array(vec)
        tmp = np.array(vec).reshape(self.grid_shape)
        idx2 = np.argwhere(tmp == 2)
        
        size = self.grid_shape[0]
        idx1 = (np.ceil(size/2),np.ceil(size/2))
                     
        a = (self._dist(idx2,idx1))
        if(a > 0):
            dist_reward = 2000 - a
            reward += dist_reward
            #print("close to objective reward : {}".format(dist_reward))
            
    
        
        
        return(reward,smaller,self.data,world_state) # return r,s,data,extra_info
        '''
    def startworld(self,world_file):
        with open(world_file,'r') as f:
            my_mission = MalmoPython.MissionSpec(f.read(), True)
        my_mission_record = MalmoPython.MissionRecordSpec()
        # Attempt to start a mission:
        max_retries = 3
        for retry in range(max_retries):
            try:
                self.host.startMission( my_mission, my_mission_record )
                sys.stdout.write("Mission Started")
                break
            except RuntimeError as e:
                if retry == max_retries - 1:
                    print ("Error starting mission:{}".format(e))
                    exit(1)
                else:
                    time.sleep(2)
        # Loop until mission starts:
        #print ("Waiting for the mission to start ")
        self.world_state = self.host.getWorldState()
        while (not self.world_state.has_mission_begun):
            sys.stdout.write(".")
            time.sleep(0.1)
            self.world_state = self.host.getWorldState()
            for error in self.world_state.errors:
                print ("Error:",error.text)
                
        ## wait until a valid observation        
        while self.world_state.is_mission_running and all(e.text=='{}' for e in self.world_state.observations):
            self.world_state = self.host.peekWorldState()
        #populate emtpy fields for init
        self.data = json.loads(self.world_state.observations[-1].text)
        
        return self.observe(True)
    def quit(self):
        self.host.sendCommand('quit')
        
    def step(self,action):
        self.host.sendCommand(self.actions[action])
        return self.observe()

## Plots

In [4]:
def update(x,y,handle,plot):
    plot.data_source.data['x'] += [x]
    plot.data_source.data['y'] += [y]
    push_notebook(handle=handle)

In [5]:
fig1 = figure(plot_width=400, plot_height=400,title="rewards",
                      x_axis_label="x",
                      y_axis_label="y")
rplot = fig1.line([],[],color="firebrick",line_width=2)
# make a grid
handle1 = show(fig1, notebook_handle=True)

## Execution

In [6]:
simple_actions = {
    'strafe':{
        'left': 'strafe -1',
        'right': 'strafe 1'
    },
    'move':{
        'back':'move -1',
        'forward':'move 1'
    }   
}
# flatten dict of actions
ractions = []
for action_type in simple_actions.keys():
    
    for action in simple_actions[action_type]:
        ractions.append(simple_actions[action_type][action])

In [27]:
def model(inpt, nact, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        h = conv(tf.cast(inpt, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
        h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
        h3 = conv(h2, 'c3', nf=32, rf=3, stride=1, init_scale=np.sqrt(2))
        h3 = conv_to_fc(h3)
        h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
        pi = fc(h4, 'pi', nact, act=lambda x:x)
        vf = fc(h4, 'v', 1, act=lambda x:x)
        return vf
def new_model(inpt,num_actions,scope,reuse=False):
    
    with tf.variable_scope(scope,reuse=reuse):
        model = Sequential()
        model.add(Dense(32,activation='relu',input_shape=(25,35,1)))
        model.add(Dense(num_actions,activation="softmax"))
        return model.output

In [28]:
def Train(env,world):
    U.reset()
    with U.make_session(2) as sess:
        replay_buffer = ReplayBuffer(5000)
        exploration = LinearSchedule(schedule_timesteps=1000, initial_p=0.9, final_p=0.5)
        episode_rewards = [0.0]
        
        Kend.set_session(sess)
        
        r,s,data,ws = env.startworld(world)
        
        
        act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: U.BatchInput((25,35,1), name=name), # prev 49
        q_func=new_model,
        num_actions= len(env.actions), #prev len(ractions)
        optimizer=tf.train.AdamOptimizer(learning_rate=0.5)
        )
        U.initialize()
        update_target()
        
        R = 0
        episode = 0
        for t in itertools.count():
            # exploration schedule update_eps=exploration.value(t)
            update_eps=exploration.value(t)
            action = act(s[None])[0]

            r,s_,data,ws = env.step(action)

            done = ws.is_mission_running is False
            replay_buffer.add(s,action,r,s_,done)


            s = s_

            episode_rewards[-1] += r
            
            R += r
            #writer = tf.summary.FileWriter("logs", sess.graph)
            if done: #mission is done
                _,s,_,ws = env.startworld('CliffWalking.xml')
                clear_output(wait=True) 
                display("mission done reward : {} @ t = {}".format(episode_rewards[-1],t))
                update(episode,episode_rewards[-1],rewards_plot,handle)
                episode_rewards.append(0)
                episode+=1
                time.sleep(0.5) # give env time to reset
                #s = gym_env.reset()


            is_solved = t > 100 and np.mean(episode_rewards) >= 10000
            
            if t%100 == 0:
                display('reward @ t= {} is {}'.format(t,r))
            
            if is_solved:
                # Show off the result
                whaa = 5+2
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()
                    clear_output(wait=True)             
                    display(r)

                if t % 100 == 0 and t > 5:
                    #display(r)
                    whaa = 5

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

## Execute

In [29]:
env = Env(ractions,(300,420,3),(41,41))
Train(env,'CliffWalking.xml')

ERROR: unrecognised option '-f'
Malmo version: 0.31.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test


Mission Started....

ValueError: Shapes must be equal rank, but are 1 and 3 for 'deepq/Select' (op: 'Select') with input shapes: [?], [?], [?,35,4].