In [1]:
import pdb;
import scipy.misc as scimisc

from tkinter import *
from PIL import Image
from PIL import ImageTk

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.animation as animation
from PIL import Image

import MalmoPython
import os
import sys
import time
import random
import json
import numpy as np
import time
from IPython.display import clear_output,display
import logging
import math


import gym
import gym_minecraft
import itertools
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers

import baselines.common.tf_util as U

from baselines import logger
from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer,PrioritizedReplayBuffer
from baselines.common.schedules import LinearSchedule

In [2]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, push_notebook, show
from bokeh.driving import linear
from bokeh.layouts import row,gridplot
from IPython.display import clear_output,display
import bokeh
output_notebook()

In [3]:
from keras import backend as K
from keras.models import Sequential,model_from_json
from keras.layers import Dense, Activation,GRU,Input,LSTM,Conv2D,Flatten
from keras.optimizers import RMSprop,Adam
from keras.callbacks import TensorBoard

Using TensorFlow backend.


In [4]:
import gym
import gym_minecraft
from MinecraftGym import MinecraftWrapper

In [None]:
class Network:
    """Builds the convolutional Q-network used by the agent.

    Parameters
    ----------
    stateCnt : tuple
        Input shape handed to the first Conv2D layer as ``input_shape``.
    actionCnt : int
        Number of discrete actions; one output unit is created per action.
    learning_rate : float
        Learning rate given to the Adam optimizer.
    """
    def __init__(self,stateCnt,actionCnt,learning_rate):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.learning_rate = learning_rate
    def build(self):
        """Create and compile a fresh Keras model.

        Returns
        -------
        A compiled ``Sequential`` model mapping a batch of frames shaped like
        ``stateCnt`` to one Q-value estimate per action.
        """
        model = Sequential()
        # Convolutional feature extractor over the (preprocessed) screen frame.
        model.add(Conv2D(32,(8,8),input_shape=self.stateCnt,activation='relu'))
        model.add(Conv2D(64,(4,4),activation='relu'))
        model.add(Conv2D(64,(3,3),activation='relu'))
        model.add(Flatten())
        model.add(Dense(60,activation='relu'))
        model.add(Dense(32,activation='relu'))
        # Q-values are unbounded regression targets (reward + gamma * Q), so the
        # head must be linear: a softmax would squash every output into [0, 1]
        # and force them to sum to 1, making the targets unreachable.
        model.add(Dense(self.actionCnt,activation='linear'))

        model.compile(loss=self._huber_loss,optimizer=Adam(lr=self.learning_rate))
        return model
    def _huber_loss(self, target, prediction):
        """Smooth Huber-like loss: mean over the last axis of sqrt(1 + error^2) - 1."""
        error = prediction - target
        return K.mean(K.sqrt(1+K.square(error))-1, axis=-1)

In [None]:
class Memory:
    """Bounded FIFO replay buffer.

    Each stored sample is whatever tuple the caller passes to ``add``; once
    more than ``capacity`` samples are held, the oldest one is discarded.
    """
    def __init__(self, capacity):
        self.capacity = capacity
        self.samples = []

    def add(self, sample):
        """Append ``sample`` and evict the oldest entry if over capacity."""
        self.samples.append(sample)
        overflowed = len(self.samples) > self.capacity
        if overflowed:
            del self.samples[0]

    def sample(self, n):
        """Return up to ``n`` distinct stored samples chosen at random."""
        held = len(self.samples)
        count = n if n < held else held
        return random.sample(self.samples, count)

In [None]:
class Agent:
    """Epsilon-greedy agent with an online model and a separate target model.

    Parameters
    ----------
    stateCnt : tuple
        Input shape forwarded to ``Network``.
    actionCnt : int
        Number of discrete actions.
    capacity : int
        Maximum number of transitions kept in the replay ``Memory``.
    epsilon : float
        Initial exploration probability.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    learning_rate : float
        Learning rate forwarded to ``Network``.
    """
    def __init__(self,stateCnt,actionCnt,capacity,epsilon,gamma,learning_rate):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.model = Network(stateCnt,actionCnt,learning_rate).build() # model
        self.target_model = Network(stateCnt,actionCnt,learning_rate).build() # target model

        self.epsilon = epsilon
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99
        self.gamma = gamma
        self.steps = 0
        self.memory = Memory(capacity)

    def remember(self,sample):
        """Store one transition tuple in the replay memory."""
        self.memory.add(sample)
    def update_target_model(self):
        """Copy the online model's weights into the target model."""
        self.target_model.set_weights(self.model.get_weights())
    def act(self, s):
        """Pick an action for state ``s`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the argmax of the online model's first output row.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.actionCnt)
        act_values = self.model.predict(s)
        return np.argmax(act_values[0]) # returns action
    def replay(self, batch_size):
        """Fit the online model on up to ``batch_size`` remembered transitions.

        Each transition is unpacked as (state, action, reward, next_state, done).
        The online model chooses the best next action and the target model
        supplies its value. Afterwards epsilon is decayed once, never dropping
        below ``epsilon_min``.
        """
        minibatch = self.memory.sample(batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                # Terminal transition: no bootstrapped future value.
                target[0][action] = reward
            else:
                a = self.model.predict(next_state)[0]
                t = self.target_model.predict(next_state)[0]
                target[0][action] = reward + self.gamma * t[np.argmax(a)]
            self.model.fit(state, target, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            # Clamp so a decay step cannot overshoot below the configured floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

# Environment

In [None]:
def preprocess(rgb_array,scale = 1/12):
    """Convert an RGB frame into a small, normalised grayscale network input.

    Parameters
    ----------
    rgb_array : array-like
        Frame whose last axis holds at least three colour channels; only the
        first three are used.
    scale : float
        Resize factor passed straight to ``scimisc.imresize``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (1, height, width, 1) with values divided by 255.

    The previous version cast the normalised frame to ``uint8`` on return,
    which truncated almost every pixel to 0; the floats are now kept.
    """
    frame = np.asarray(rgb_array)
    height, width = frame.shape[0], frame.shape[1]

    # Weighted sum of the R, G, B channels gives a single luminance channel.
    gray_frame = np.dot(frame[...,:3],[0.299,0.587,0.114]).reshape((height,width))
    # NOTE(review): scipy.misc.imresize was removed from recent SciPy releases;
    # this call requires the older SciPy the notebook was written against.
    smaller = scimisc.imresize(gray_frame,scale,mode='L').astype('float32')
    smaller /= 255.0
    # Add a leading batch axis and a trailing channel axis.
    return smaller[np.newaxis, ..., np.newaxis]

In [10]:
def render(obs,root,canvas):
    """Draw a single grayscale frame on a Tk canvas.

    Parameters
    ----------
    obs : numpy.ndarray
        Frame with a singleton axis at index 2 (it is squeezed away); values
        are interpreted as 8-bit gray levels.
    root : Tk widget
        Widget used to hold a reference to the image and to pump the event loop.
    canvas : tkinter.Canvas
        Canvas that is cleared and redrawn with the frame.

    The image position is read from the module-level ``frame_height`` and
    ``frame_width`` variables, which must be defined before calling.
    """
    obs = np.squeeze(obs,2)
    # Mode 'L' is unsigned 8-bit; casting to signed int8 put every value above
    # 127 out of range, so cast to uint8 instead.
    image = Image.fromarray(obs.astype('uint8'),mode='L')
    photo = ImageTk.PhotoImage(image)
    # Keep a reference on the widget so the PhotoImage is not garbage collected
    # while it is still displayed.
    root.one = photo
    canvas.delete("all")
    canvas.create_image(frame_height,frame_width, image=photo)
    root.update()

## Test

In [None]:
# Test

# Smoke test: drive the Minecraft environment with random actions and show the
# preprocessed frames in a small Tk window.
root = Tk()
root_frame = Frame(root)
canvas = Canvas(root_frame, borderwidth=0, highlightthickness=0, width=200, height=130, bg="black" )
root_frame.pack()
canvas.pack()

# Read by render() as module-level globals to position the image.
frame_height = 25
frame_width = 35


env = gym.make("MinecraftBasic-v0")
try:
    env.load_mission_file("./CliffWalking.xml")
    env.init(videoResolution=[420,300],allowContinuousMovement=["move", "turn", "strafe"])

    scale = 1/12 # scale image down by 1/12
    newshape = (env.video_height*scale,env.video_width*scale,1) # dimension of 1 for grayscale
    newshape = tuple(map(int,newshape))

    # The preprocessor changes the observation shape, so edit the environment's
    # observation_space to take the preprocessor into account.
    env.observation_space = gym.spaces.Box(low=0, high=255,
                                           shape=newshape)

    done = False

    for i in range(1000):
        env.reset()
        while True:
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            proc_obs = preprocess(obs,scale)

            # preprocess() returns a batch of one frame with values divided by
            # 255, while render() squeezes axis 2 of a single (height, width, 1)
            # frame of gray levels: drop the batch axis and rescale for display.
            render(proc_obs[0] * 255,root_frame,canvas)

            if done:
                break
finally:
    # Always release the environment and the window, whether the loop finished
    # or raised; any exception still propagates after cleanup.
    env.close()
    root.destroy()


# Plots

In [12]:
def update(x,y,handle,plot):
    """Append the point (x, y) to ``plot``'s data source and refresh the notebook output.

    ``plot`` is a glyph renderer whose data source has 'x' and 'y' columns;
    ``handle`` is the notebook handle passed on to ``push_notebook``.
    """
    for column, value in (('x', x), ('y', y)):
        plot.data_source.data[column] += [value]
    push_notebook(handle=handle)

In [13]:
inferno = bokeh.palettes.Inferno9

# Empty line plot for per-episode rewards; points are appended later through
# update() using the notebook handle created here.
fig1 = figure(
    title="rewards",
    plot_width=400,
    plot_height=400,
    x_axis_label="x",
    y_axis_label="y",
)
rplot = fig1.line([], [], line_width=2)

# Show the figure and keep the handle so later pushes can refresh it in place.
handle1 = show(fig1, notebook_handle=True)

# Execute

In [5]:
# Create the raw Minecraft cliff-walking environment, configure its video and
# observation options, then wrap it with MinecraftWrapper.
pre_env = gym.make("MinecraftCliffWalking1-v0")
pre_env.init(
    videoResolution=[400,400],
    allowContinuousMovement=["move", "turn", "strafe"],
    observeGrid=[20,-1,20,20,-1,20],
    observeDistance=[4,45,12],
)
env = MinecraftWrapper(pre_env, 1/10, (41,41))

In [None]:
# Atari Space Invaders environment; this is the env handed to train() further down.
atari_env = gym.make('SpaceInvaders-v0')

In [None]:

def train(env,episodes,stateCnt,actionCnt,memory_capacity,epsilon,gamma,learning_rate):
    """Run the DQN training loop on ``env``.

    Parameters
    ----------
    env : gym environment
        Must provide ``reset()`` and ``step(action)``.
    episodes : int
        Number of episodes to play.
    stateCnt : tuple
        Shape of one preprocessed frame, forwarded to ``Agent``.
    actionCnt : int
        Number of discrete actions, forwarded to ``Agent``.
    memory_capacity : int
        Replay memory size, forwarded to ``Agent``.
    epsilon, gamma, learning_rate : float
        Exploration rate, discount factor and optimizer learning rate,
        forwarded to ``Agent``.

    For every episode the agent acts until the environment reports ``done``.
    Then the target model is synchronised, the episode reward is logged and
    plotted, and one replay pass over a batch of 64 transitions is run.
    """
    # NOTE(review): `Experiment` is not imported anywhere in the visible
    # notebook; it must be brought into scope before this function is called.
    exp = Experiment("DQN - Space Invaders")
    batch_size = 64
    agent = Agent(stateCnt,actionCnt,memory_capacity,epsilon,gamma,learning_rate)
    for e in range(episodes):
        episode_reward = 0.0
        s = preprocess(env.reset(),1/5)
        done = False

        while not done:
            a = agent.act(s)

            pre_s_, r, done, info = env.step(a)
            s_ = preprocess(pre_s_,1/5)
            agent.remember((s,a,r,s_,done))
            s = s_
            episode_reward += r

        # Episode finished: sync the target network, then report the reward.
        agent.update_target_model()
        exp.metric("reward",episode_reward)
        update(e,episode_reward,handle1,rplot)
        agent.replay(batch_size)
    
    

In [None]:
# Scratch cell: the state-shape tuple that appears in the train() call below.
(42,32,1)

In [13]:
# train() takes the episode count as its second positional argument. The
# earlier call omitted it, so the shape tuple was bound to `episodes` and the
# call failed with a missing-argument error.
EPISODES = 1000  # NOTE(review): placeholder episode budget — adjust as needed
train(atari_env,EPISODES,(42,32,1),atari_env.action_space.n,10000,1,0.99,0.00025)

Object `env.reset` not found.
