In [1]:
# install OpenAI gym per https://gym.openai.com/docs/
# see scratch_lec05 for all basics of q learning and cartpole in gym
import gym
import random
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from collections import namedtuple
from random import randint
# env = gym.make('CartPole-v0')
import env_without_Airsim

In [2]:
# Build the environment instance shared by every cell below (step() and the
# training loop both use this module-level `env`).
# NOTE(review): the meaning of the positional arguments (2, 1) is defined in
# env_without_Airsim and is not visible here -- confirm and give them names.
env = env_without_Airsim.simplified_drone_env(2, 1)

In [3]:
# One experience tuple as stored in the replay buffer.
Transition = namedtuple('Transition',
                        ['state', 'action', 'next_state', 'reward'])


class Replay:
    """Fixed-capacity ring buffer of ``Transition`` records.

    ``memory`` grows until it holds ``max_length`` items; after that the
    oldest entries are overwritten in insertion order.  Despite its name,
    ``length`` is the index of the slot the next ``push`` writes to, not the
    number of stored items (use ``len(buffer)`` for that).
    """

    def __init__(self, max_length = 20000):
        self.max_length = max_length
        self.length = 0
        self.memory = []

    def push(self, *args):
        """Store ``Transition(*args)`` in the next slot, wrapping when full."""
        still_growing = len(self.memory) < self.max_length
        if still_growing:
            # reserve the slot that is filled just below
            self.memory.append(None)
        slot = self.length
        self.memory[slot] = Transition(*args)
        self.length = (slot + 1) % self.max_length

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        batch = random.sample(self.memory, batch_size)
        return batch

    def __len__(self):
        """Number of transitions currently stored."""
        stored = len(self.memory)
        return stored

In [4]:
class Network:
    """Two-layer fully connected Q-network built with the TensorFlow 1.x graph API.

    The graph maps a batch of states ``x`` to one value per action (``q``).
    For training, each row contributes only the value of the action that was
    actually taken (``a``), which is regressed towards the target ``y``.
    """

    def __init__(self, session, n_in, n_out):
        """Build placeholders, weights, the forward pass, the loss and the train op.

        Args:
            session: TensorFlow session used later by ``compute`` and ``train``.
            n_in: number of features in one state vector.
            n_out: number of actions, i.e. width of the output layer.
        """
        self.session = session
        self.n_in = n_in
        self.n_out = n_out
        # TODO: hidden-layer width is hard-coded; make it configurable
        self.n_hidden = 60
        # x: batch of states, y: regression target per row, a: action index per row
        self.x = tf.placeholder(tf.float32, [None, n_in], name = 'x')
        self.y = tf.placeholder(tf.float32, [None], name = 'y')
        self.a = tf.placeholder(tf.int32, [None], name = 'a')
        # x is already declared as [None, n_in], so this reshape keeps the same shape
        self.x_in = tf.reshape(self.x, [-1, self.n_in])

        # NOTE(review): variables are created under fixed names with tf.get_variable;
        # building a second Network in the same graph/scope is expected to clash -- confirm.
        self.W_fc1 = tf.get_variable('W_fc1', shape = [self.n_in, self.n_hidden])
        self.b_fc1 = tf.get_variable('b_fc1', shape = [self.n_hidden])

        # hidden layer: relu(x . W1 + b1)
        self.h_fc1 = tf.nn.relu(tf.add(tf.matmul(self.x_in, self.W_fc1), self.b_fc1), name = 'layer1')

        self.W_fc2 = tf.get_variable('W_fc2', shape = [self.n_hidden, self.n_out])
        self.b_fc2 = tf.get_variable('b_fc2', shape = [self.n_out])

        # output layer is linear: one value per action for every row of the batch
        self.q = tf.add(tf.matmul(self.h_fc1, self.W_fc2), self.b_fc2, name = 'layer2')
        # q is flattened below, so the entry for (row i, action a[i]) sits at
        # flat index i * n_actions + a[i]
        gather_indices = tf.range(tf.shape(self.x)[0]) * tf.shape(self.q)[1]  + self.a
        # value predicted for the action taken in each row
        # NOTE(review): this gather is likely the source of the "Converting sparse
        # IndexedSlices to a dense Tensor of unknown shape" warning in the output -- confirm.
        self.a_predict = tf.gather(tf.reshape(self.q, [-1]), gather_indices)
        
        # summed (not averaged) squared error over the batch, minimised by Adam at lr 1e-4
        self.loss = tf.reduce_sum(tf.square(self.y - self.a_predict))
        self.train_step = tf.train.AdamOptimizer(1e-4).minimize(self.loss)


    def compute(self, x):
        """Evaluate the network on one state or a batch of states.

        ``x`` is reshaped to ``[-1, n_in]`` before being fed, so a single
        state vector and a list of state vectors are both accepted.  Returns
        the evaluated ``q`` tensor: one row per state, ``n_out`` values per row.
        """
        return self.session.run(self.q, feed_dict={self.x:np.reshape(x,[-1,self.n_in])})

    def train(self, x_batch, y_batch, action_batch):
        """Run one optimiser step on a batch of (state, target, action index) rows."""
        # the result of running the train op is not used
        _ = self.session.run(self.train_step, feed_dict={self.x: x_batch, self.y: y_batch, self.a: action_batch})

In [5]:
def step(env, action):
    """Advance ``env`` by one step and reduce its reply to what the agent uses.

    The agent's action index is shifted by 2 before being passed to
    ``env.step``, which must return exactly seven values.  Only the first,
    second and fourth of them (x, y, angle) form the state; the third and
    fifth are discarded.

    Returns:
        (state, reward, done) where ``state`` is the list ``[x, y, angle]``.
    """
    # NOTE(review): presumably the environment's first two action ids are not
    # used by this agent -- confirm against env_without_Airsim.
    env_action = action + 2
    pos_x, pos_y, _pos_z, heading, _ignored, finished, step_reward = env.step(env_action)
    return [pos_x, pos_y, heading], step_reward, finished

In [6]:
class Agent:
    """Epsilon-greedy DQN agent backed by an experience-replay buffer.

    The agent stores (state, action, next_state, reward) transitions, samples
    mini-batches from them and regresses the Q-network towards the one-step
    Q-learning target ``reward + gamma * max_a Q(next_state, a)``; the
    bootstrap term is zero when ``next_state`` is ``None`` (terminal).
    """

    def __init__(self, tf_session):
        """Create the replay buffer and the Q-network.

        Args:
            tf_session: TensorFlow session handed to ``Network``.
        """
        # a state is [x, y, angle] (see step()); one network output per action
        self.n_in = 3
        self.n_out = 6
        # reward accumulated in the current episode
        self.total_reward = 0
        # discount factor, exploration rate, mini-batch size
        self.gamma = 0.99
        self.epsilon = 1.0
        self.batch_size = 50
        # experience replay buffer
        self.replay_buffer = Replay()
        # the network that approximates the Q function
        self.q = Network(tf_session, self.n_in, self.n_out)

    def gather_experience(self, last_observation, action, reward, observation):
        """Store one transition; ``observation`` is ``None`` for a terminal step."""
        # Transition field order is (state, action, next_state, reward)
        self.replay_buffer.push(last_observation, action, observation, reward)

    def choose_action(self, observation):
        """Pick an action index in ``[0, n_out)`` with an epsilon-greedy policy."""
        if np.random.rand() > self.epsilon:
            # exploit: action with the highest predicted value
            return np.argmax(self.q.compute(observation))
        # explore: uniform over all actions (randint is inclusive on both ends);
        # derived from n_out so it cannot drift from the network's output width
        return randint(0, self.n_out - 1)

    def q_update(self):
        """Run one Q-learning training step on a random mini-batch.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return
        transitions = self.replay_buffer.sample(self.batch_size)
        # list of Transitions -> one Transition whose fields are per-field tuples
        sars_batch = Transition(*zip(*transitions))

        # Explicit boolean ndarray mask: a plain Python list of bools is not
        # guaranteed to be treated as a mask by every NumPy version.
        non_terminal = np.array([s is not None for s in sars_batch.next_state],
                                dtype=bool)

        # max_a Q(next_state, a); stays 0 for terminal transitions
        next_state_values = np.zeros(self.batch_size)
        if non_terminal.any():
            # evaluate all non-terminal next states in a single batch
            next_states = [s for s in sars_batch.next_state if s is not None]
            next_state_values[non_terminal] = np.max(self.q.compute(next_states), axis=1)

        # one-step Q-learning target for the action that was taken
        y_batch = next_state_values * self.gamma + np.asarray(sars_batch.reward, dtype=float)
        x_batch = sars_batch.state
        self.q.train(x_batch, y_batch, sars_batch.action)

    def set_epsilon(self, episode):
        """Decay exploration exponentially from 1.0 towards 0.01 with the episode index.

        NOTE: performance depends on this schedule -- experiment!  The form is
        the one suggested by
        https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
        """
        self.epsilon = 0.01 + (1.0 - 0.01)*np.exp(-.001*episode)

    def reset_epsilon(self):
        """Switch to a purely greedy policy (for evaluation after training)."""
        self.epsilon = 0.0

    def gather_reward(self, reward):
        """Add ``reward`` to the running episode total."""
        self.total_reward += reward

    def get_total_reward(self):
        """Return the reward accumulated since the last ``set_total_reward``."""
        return self.total_reward

    def set_total_reward(self, new_total):
        """Overwrite the running episode total (used to reset it to 0)."""
        self.total_reward = new_total

In [None]:
# Training-run settings (previously inline magic numbers).
N_EPISODES = 1501
MAX_STEPS_PER_EPISODE = 1000
REPORT_EVERY = 50

with tf.Graph().as_default():
    # total reward of every episode, one entry per episode
    ep_rewards = []
    with tf.Session() as sess:
        # create an agent (this builds the Q-network inside the graph above)
        agent = Agent(sess)
        # usual tf initialization
        sess.run(tf.global_variables_initializer())
        ####
        # Q-learn (train) the DQN on the drone environment
        ####
        for ep in range(N_EPISODES):
            # reset environment and agent
            # NOTE(review): env.reset() is assumed to return a state shaped like
            # the [x, y, angle] list produced by step() -- confirm.
            last_observation = env.reset()
            agent.set_total_reward(0)
            # x / y positions visited in this episode, for the trajectory plot
            x = []
            y = []
            # cap the episode length in case the environment never signals done
            for t in range(MAX_STEPS_PER_EPISODE):
                # agent chooses an action
                action = agent.choose_action(last_observation)
                # agent takes the action, and the environment responds
                observation, reward, done = step(env, action)
                x.append(observation[0])
                y.append(observation[1])
                # a terminal step is stored with next_state None so that
                # q_update uses no bootstrap value for it
                if done:
                    observation = None
                # update agent with reward and data
                agent.gather_reward(reward)
                agent.gather_experience(last_observation, action, reward, observation)
                # update q function, which will use the memory
                agent.q_update()
                # iterate
                last_observation = observation
                if done:
                    break
            # Record every episode, including those that hit the step cap
            # without terminating; otherwise ep_rewards does not line up with
            # the episode index and the running average below is skewed.
            ep_rewards.append(agent.get_total_reward())
            # print progress...
            if (ep+1) % REPORT_EVERY == 0:
                fig = plt.figure()
                ax = fig.add_subplot(111)
                ax.plot(x, y)
                fig.savefig('image%s.pdf'%ep)
                # close the figure: it is already on disk, and leaving it open
                # leaks figures and makes it the target of later plt.* calls
                plt.close(fig)
                print('After {} episodes, last 50 rewards averaged {}'.format(ep+1, np.mean(ep_rewards[-50:])))
            # update rates
            agent.set_epsilon(ep)
        # learning curve on its own figure (not on top of a trajectory plot)
        fig, ax = plt.subplots()
        ax.plot(ep_rewards, linewidth=2)
        ax.set_xlabel('episode')
        ax.set_ylabel('total reward per episode')
        # NOTE(review): title text is inherited from the CartPole example this
        # notebook was adapted from; the environment here is the drone env.
        ax.set_title('DQN CartPole q-learning (training)')
        plt.show()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


position: 1.0 0.0 0 ; distance to certer: 2.2360679775 ; Step Reward: -2.47651122074
position: 0.0 0.0 0 ; distance to certer: 2.82842712475 ; Step Reward: -2.61574990411
position: 0.0 0.0 0 ; distance to certer: 2.82842712475 ; Step Reward: -2.94908323744
position: -0.5 -0.866025403784 0 ; distance to certer: 3.80316994297 ; Step Reward: -3.31579337719
position: -1.36602540378 -0.366025403784 0 ; distance to certer: 4.11438977617 ; Step Reward: -3.32406775378
position: -0.5 -0.866025403784 0 ; distance to certer: 3.80316994297 ; Step Reward: -3.31579337719
position: 0.366025403784 -1.36602540378 0 ; distance to certer: 3.74165738677 ; Step Reward: -3.36151030132
position: 1.23205080757 -1.5 0 ; distance to certer: 3.58325912573 ; Step Reward: -100
position: 0.732050807569 -1.5 0 ; distance to certer: 3.72259253137 ; Step Reward: -100
position: 0.732050807569 -1.5 0 ; distance to certer: 3.72259253137 ; Step Reward: -3.03634107185
position: 0.732050807569 -1.5 0 ; distance to certer: 3

  area = ( S * (S - l1) * (S - l2) * (S - l12) ) ** 0.5


position: 2.90192378865 5.5 0 ; distance to certer: 3.61434178247 ; Step Reward: -2.43234186903
position: 2.90192378865 5.5 0 ; distance to certer: 3.61434178247 ; Step Reward: -2.76567520237
position: 2.40192378865 5.5 0 ; distance to certer: 3.52300194889 ; Step Reward: -100
position: 2.40192378865 5.5 0 ; distance to certer: 3.52300194889 ; Step Reward: -2.67682845606
position: 2.90192378865 5.5 0 ; distance to certer: 3.61434178247 ; Step Reward: -100
position: 2.90192378865 5.5 0 ; distance to certer: 3.61434178247 ; Step Reward: -2.0990085357
position: 2.90192378865 5.5 0 ; distance to certer: 3.61434178247 ; Step Reward: -100
position: 2.90192378865 5.5 0 ; distance to certer: 3.61434178247 ; Step Reward: -1.76567520237
position: 2.90192378865 5.5 0 ; distance to certer: 3.61434178247 ; Step Reward: -2.0990085357
position: 1.90192378865 5.5 0 ; distance to certer: 3.5013738651 ; Step Reward: -2.56727702864
position: 1.90192378865 5.5 0 ; distance to certer: 3.5013738651 ; Step R