In [7]:
# Simple env test.
import json
import select
import time
import logging
import os

import gym
import snake_gym
import minerl
import random

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import math
from collections import deque

# Module-level MineRL environment; main() below reads this global directly
# (reset, step and action_space.noop are all called on it).
env = gym.make("MineRLNavigateDense-v0")

In [8]:
class dqn_network():
    """Convolutional Q-network over 64x64x4 observations, built in the default TF1 graph.

    Constructing an instance adds the placeholders, three convolutional layers,
    a linear Q-value head and an Adam training op to the current default graph.

    Args:
        num_actions: Number of discrete actions, i.e. width of the Q-value head.
        learning_rate: Step size passed to the Adam optimizer.

    Attributes:
        state: float32 placeholder, shape [None, 64, 64, 4] (batch of observations).
        conv1, conv2, conv3: Outputs of the successive convolutional layers.
        flat: conv3 flattened to [None, features].
        out: [None, num_actions] tensor of Q-value estimates, one per action.
        predict: Index of the highest-valued action for each batch row.
        action: int32 placeholder, shape [None], actions that were actually taken.
        actions_onehot: One-hot encoding of `action`.
        Q: Q-value of the taken action for each batch row.
        targetQ: float32 placeholder, shape [None], regression targets for `Q`.
        td_error: Squared difference between `targetQ` and `Q`.
        loss: Mean of `td_error` over the batch.
        train_step: Adam op that minimizes `loss`.
    """

    def __init__(self, num_actions=3, learning_rate=0.001):
        self.state = tf.placeholder(shape=[None,64,64,4], dtype=tf.float32)

        # Layers are created in the same order as before so the automatically
        # generated variable names (and therefore saved checkpoints) are unchanged.
        self.conv1 = self._conv(self.state, filters=32, kernel_size=[8,8], strides=[4,4])
        self.conv2 = self._conv(self.conv1, filters=64, kernel_size=[4,4], strides=[2,2])
        self.conv3 = self._conv(self.conv2, filters=64, kernel_size=[3,3], strides=[1,1])
        self.flat = tf.layers.flatten(self.conv3)

        # Linear head: Q-values are unbounded return estimates. A softmax here
        # would squash them into [0, 1] with each row summing to 1, so the
        # squared-error loss below could never reach targets of the form
        # r + gamma * max Q' (which may be negative or larger than 1).
        self.out = tf.layers.dense(self.flat, num_actions, activation=None)
        self.predict = tf.argmax(self.out, 1)

        # Select the Q-value of the action that was actually taken.
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.action, num_actions, dtype=tf.float32)
        self.Q = tf.reduce_sum(tf.multiply(self.out, self.actions_onehot), axis=1)

        self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
        self.td_error = tf.square(self.targetQ - self.Q)

        # NOTE(review): the L2 regularizers attached to the conv layers only
        # register their penalty terms with the graph; nothing here adds them
        # to `loss`, so they do not influence training as written. Confirm
        # whether that is intended before wiring them in.
        self.loss = tf.reduce_mean(self.td_error)
        self.train_step = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

    @staticmethod
    def _conv(inputs, filters, kernel_size, strides):
        """Build one VALID-padded ReLU conv layer with the shared initializer/regularizers."""
        return tf.layers.conv2d(inputs=inputs, filters=filters, kernel_size=kernel_size,
                                strides=strides, padding='VALID', activation=tf.nn.relu,
                                kernel_initializer=tf.contrib.layers.xavier_initializer(uniform=False),
                                # 10e-5 is 1e-4.
                                kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=10e-5),
                                bias_regularizer=tf.contrib.layers.l2_regularizer(scale=10e-5))

In [9]:
def converter(observation):
    """Turn an environment observation into a single channels-last array.

    The 'pov' image is divided by 255 and an extra channel, filled with
    'compassAngle' divided by 180, is appended along the last axis.

    Args:
        observation: Mapping with a 'pov' array (channels on the last axis)
            and a 'compassAngle' value.
            # assumes 'pov' holds 8-bit pixel values and 'compassAngle' is in
            # degrees within [-180, 180] - TODO confirm against the environment.

    Returns:
        Array with the same leading dimensions as 'pov' and one more channel.
    """
    pixels = observation['pov'] / 255
    heading = observation['compassAngle'] / 180

    # One constant-valued plane carrying the scaled compass reading.
    plane_shape = list(pixels.shape[:-1]) + [1]
    heading_plane = np.ones(shape=plane_shape, dtype=pixels.dtype) * heading

    return np.concatenate([pixels, heading_plane], axis=-1)

In [10]:
def main(model_path='/Users/eyifjin/COALA/Phd/SummerSchool2020/minecraftrl/src/MineRLObtainDiamondVectorObf-v0',
         annealing_episodes=100):
    """Train the DQN on the module-level `env` with epsilon-greedy exploration.

    Epsilon starts at 1.0 and is lowered by a fixed amount at the start of each
    episode until it reaches 0.1. Every transition is stored in a bounded
    replay buffer; every 512 environment steps one gradient step is taken on
    512 transitions sampled from that buffer. Trainable variables are
    checkpointed every 500 steps. Checkpoints are only written here, never
    restored.

    Args:
        model_path: Directory that receives the checkpoints. It is created if
            it does not exist.
        annealing_episodes: Number of episodes to run; also the number of
            episodes over which epsilon is annealed.
    """
    # Exploration schedule.
    startE = 1.0
    endE = 0.1
    e = startE
    # Guard against a zero episode count so the schedule cannot divide by zero.
    stepDrop = (startE - endE) / annealing_episodes if annealing_episodes > 0 else 0.0

    gamma = 0.99             # discount applied to the bootstrapped next-state value
    batch_size = 512         # also the number of steps between gradient updates
    buffer_capacity = 50000  # oldest transitions are dropped beyond this size
    save_every = 500         # steps between checkpoints

    network = dqn_network()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = False

    variables = tf.trainable_variables(scope=None)
    saver = tf.train.Saver(variables, max_to_keep=5)

    # Saver.save fails if the parent directory is missing, so create it up front.
    os.makedirs(model_path, exist_ok=True)

    # deque(maxlen=...) evicts the oldest transition automatically.
    episodeBuffer = deque(maxlen=buffer_capacity)
    total_steps = 0
    rList = []

    # The context manager closes the session even if the environment raises.
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        for i in range(annealing_episodes):
            # Reset environment and get first new observation
            obs = env.reset()
            s = converter(obs)

            d = False
            rAll = 0

            if e > endE:
                e -= stepDrop

            # No step cap is enforced here: the episode runs until the
            # environment itself reports done.
            while not d:
                total_steps += 1

                # Epsilon-greedy: random action with probability e, otherwise
                # the action with the highest predicted Q-value.
                if np.random.rand() < e:
                    action_index = np.random.randint(0,3)
                else:
                    action_index = sess.run(network.predict, feed_dict={network.state:[s]})[0]

                # Map the discrete index to an environment action:
                # 0 and 1 set 'camera' to [0, -5] / [0, 5], 2 sets 'forward'.
                action = env.action_space.noop()
                if action_index == 0:
                    action['camera'] = [0, -5]
                elif action_index == 1:
                    action['camera'] = [0, 5]
                elif action_index == 2:
                    action['forward'] = 1

                # Jump and attack are set on every step regardless of the choice.
                action['jump'] = 1
                action['attack'] = 1

                obs1, r, d, _ = env.step(action)
                s1 = converter(obs1)

                episodeBuffer.append((s,action_index,r,s1,d))

                if total_steps % save_every == 0:
                    saver.save(sess, model_path + '/model-' + str(total_steps) + '.cptk')

                if total_steps % batch_size == 0:
                    trainBatch = random.sample(episodeBuffer, batch_size)
                    s_batch, a_batch, r_batch, s1_batch, done_batch = zip(*trainBatch)

                    # 1 for non-terminal transitions, 0 for terminal ones, so
                    # terminal targets reduce to the immediate reward.
                    not_done = 1 - np.array(done_batch).astype(int)

                    Q1 = sess.run(network.out, feed_dict={network.state:np.array(s1_batch)})
                    targetQ = np.array(r_batch) + gamma * np.max(Q1, axis=1) * not_done

                    sess.run(network.train_step, feed_dict={network.state:np.array(s_batch),
                                                            network.targetQ:targetQ,
                                                            network.action:np.array(a_batch)})

                rAll += r
                s = s1

            print("rAll: " + str(rAll))
            rList.append(rAll)

            # Report a running mean of the last 10 episode returns.
            if len(rList) % 10 == 0:
                print(i, np.mean(rList[-10:]), e)

# Start training only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Instructions for updating:
Use standard file APIs to delete files with this prefix.
rAll: -46.02125549316406


TypeError: a bytes-like object is required, not 'NoneType'