In [1]:
import pandas as pd
from keras.models import load_model, Sequential, Model
from keras.initializers import RandomUniform
from keras.regularizers import l2
from keras.layers.wrappers import TimeDistributed
from keras import backend as K
from keras.layers import Dense, Dropout, Input, MaxPooling1D,Conv1D,LSTM, Flatten, Reshape
from keras.layers.merge import Add, Multiply, Concatenate
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import time
import numpy as np
import gym
import random
from collections import deque
import tensorflow as tf
import os

    
class Actor_Critic:
    """Actor-critic agent (DDPG-style) built from Keras models on a TF1 session.

    The agent keeps a *local* and a *target* copy of both the actor and the
    critic.  Transitions are stored in a bounded replay deque; ``TRAIN``
    samples a mini-batch, fits the critic, pushes the critic's action
    gradient through the actor and finally soft-updates both targets.

    Observations are expected as arrays of shape ``(1, nx_obs + nx_lidar)``:
    the first ``nx_obs`` entries are fed to the dense "state" branch and the
    remaining ``nx_lidar`` entries to the 1-D convolutional "lidar" branch.
    """

    def __init__(self, nx, ny, s_link, sess, batch):
        """Build the four networks and the actor training op.

        Args:
            nx: observation-space shape tuple; only ``nx[0]`` is used.
            ny: action-space shape tuple; only ``ny[0]`` is used.
            s_link: stored on the instance as ``self.s_link`` (not otherwise
                used inside this class).
            sess: TensorFlow session used to run the gradient/optimizer ops.
            batch: mini-batch size recorded in ``self.batch``.
        """
        self.nx = nx[0]
        self.nx_lidar = 10      # trailing entries of an observation (lidar branch)
        self.nx_obs = 14        # leading entries of an observation (state branch)
        self.ny = ny[0]
        self.lr_actor = 1e-4
        self.lr_critic = 3e-4
        self.batch = batch
        self.gamma = 0.99       # multiplier on the target critic's estimate
        self.alpha = 0.1        # blending factor used in the critic target
        self.s_link = s_link
        self.sess = sess
        self.deck = deque(maxlen=4000)   # replay buffer of transitions
        self.e = 1.0            # probability of taking a random action
        self.e_ = 0.01          # threshold below which `e` stops decaying
        self.dc = 0.9999        # multiplicative decay applied to `e` per TRAIN
        self.tau = 1e-3         # soft-update mixing rate for target networks
        self.weight_decay = 0.0001
        self.los = []
        self.parameters = {'lr_actor': self.lr_actor, 'lr_critic': self.lr_critic,
                           'gamma': self.gamma, 'alpha': self.alpha,
                           'tau': self.tau, 'dc': self.dc, 'Batch': self.batch}

        # Actor: local network, target network and the policy-gradient op.
        self.actor_lidar_input, self.actor_state_input, self.actor_local = self.Actor()
        _, _, self.actor_target = self.Actor()
        # dQ/da supplied by the critic; negated below so that the optimizer's
        # minimisation step performs gradient *ascent* on Q.
        self.actor_critic_grads = tf.placeholder(tf.float32, [None, self.ny])
        actor_local_weights = self.actor_local.trainable_weights
        self.actor_grads = tf.gradients(self.actor_local.output,
                                        actor_local_weights,
                                        -self.actor_critic_grads)
        grads = zip(self.actor_grads, actor_local_weights)
        self.optimize = tf.train.AdamOptimizer(self.lr_actor).apply_gradients(grads)

        # Critic: local network, target network and dQ/da.
        (self.critic_lidar_input, self.critic_state_input,
         self.critic_action_input, self.critic_local) = self.Critic()
        _, _, _, self.critic_target = self.Critic()
        self.critic_grads = tf.gradients(self.critic_local.output,
                                         self.critic_action_input)

        self.sess.run(tf.global_variables_initializer())
        # Start the targets as exact copies of the local networks; otherwise
        # the soft updates would blend two unrelated random initialisations.
        self.actor_target.set_weights(self.actor_local.get_weights())
        self.critic_target.set_weights(self.critic_local.get_weights())
        self.ep_rewards = []

    def _split_observation(self, observation):
        """Return ``(lidar, state)`` network inputs for one stored observation.

        ``observation[0]`` is sliced into its first ``nx_obs`` entries
        (reshaped to ``(1, nx_obs)``) and the rest (reshaped to
        ``(1, nx_lidar, 1)``).
        """
        flat = observation[0]
        state = flat[:self.nx_obs].reshape((1, self.nx_obs))
        lidar = flat[self.nx_obs:].reshape((1, self.nx_lidar, 1))
        return lidar, state

    def choose_action(self, observation):
        """Pick an action with an epsilon-greedy rule.

        With probability ``self.e`` a uniform random action in ``[-1, 1]`` of
        shape ``(ny,)`` is returned; otherwise the local actor's prediction of
        shape ``(1, ny)`` is returned.
        """
        if np.random.rand() <= self.e:
            return np.random.uniform(-1, 1, self.ny)
        lidar, state = self._split_observation(observation)
        return self.actor_local.predict([lidar, state])

    def storing(self, observation, action, reward, observation_new, flags):
        """Append one transition to the replay buffer and log its reward."""
        self.deck.append((observation, action, reward, observation_new, flags))
        self.ep_rewards.append(reward)

    def save(self, name):
        """Persist both local networks.

        The actor is written to ``name``; the critic is written next to it as
        ``<root>_critic<ext>`` so that it no longer overwrites the actor file.
        """
        root, ext = os.path.splitext(name)
        self.actor_local.save(name)
        self.critic_local.save(root + "_critic" + ext)

    def Actor(self):
        """Build one actor network.

        Returns:
            ``(lidar_input, state_input, model)`` where ``model`` maps
            ``[lidar, state]`` to a tanh-squashed action of size ``ny``.
        """
        lidar_input = Input(shape=(self.nx_lidar, 1))
        lidar_conv = Conv1D(64, 4, activation='relu')(lidar_input)
        pool = MaxPooling1D(4)(lidar_conv)
        flat = Flatten()(pool)

        state_input = Input(shape=(self.nx_obs,))
        state_h1 = Dense(192, activation='relu')(state_input)

        merged = Concatenate()([flat, state_h1])
        # With nx_lidar == 10 the lidar branch flattens to 64 features, so the
        # concatenation is 64 + 192 = 256 wide; it is fed to the LSTM as a
        # 256-step sequence of scalars.
        merged_reshaped = Reshape((256, 1))(merged)
        merged_lstm = LSTM(256, activation='relu')(merged_reshaped)
        output = Dense(self.ny, activation='tanh')(merged_lstm)

        model = Model(inputs=[lidar_input, state_input], outputs=output)
        model.compile(loss='mse', optimizer=Adam(lr=self.lr_actor))
        return lidar_input, state_input, model

    def Critic(self):
        """Build one critic network.

        Returns:
            ``(lidar_input, state_input, action_input, model)`` where
            ``model`` maps ``[lidar, state, action]`` to a single linear value.
        """
        lidar_input = Input(shape=(self.nx_lidar, 1))
        lidar_conv = Conv1D(64, 4, activation='relu')(lidar_input)
        pool = MaxPooling1D(4)(lidar_conv)
        flat = Flatten()(pool)

        state_input = Input(shape=(self.nx_obs,))
        state_h1 = Dense(192, activation='relu')(state_input)

        action_input = Input(shape=(self.ny,))
        action_h1 = Dense(64, activation='relu')(action_input)

        merge1 = Concatenate()([flat, state_h1])
        merged_dense = Dense(256, activation='relu')(merge1)

        # 256 (observation features) + 64 (action features) = 320.
        merge2 = Concatenate()([merged_dense, action_h1])
        merge2reshaped = Reshape((320, 1))(merge2)
        merge_lstm = LSTM(320, activation='relu')(merge2reshaped)
        output = Dense(1, activation='linear')(merge_lstm)

        model = Model(inputs=[lidar_input, state_input, action_input],
                      outputs=output)
        model.compile(loss="mse", optimizer=Adam(lr=self.lr_critic))
        return lidar_input, state_input, action_input, model

    def _train_critic(self, sample_indx):
        """Fit the local critic on each sampled transition, one at a time."""
        for observation, act, reward, obs_new, done in sample_indx:
            Q_target = np.array(reward).reshape(1, -1)
            act = act.reshape(1, -1)
            lidar, state = self._split_observation(observation)
            if not done:
                lidar_new, state_new = self._split_observation(obs_new)
                target_action = self.actor_target.predict([lidar_new, state_new])
                future_reward = self.critic_target.predict(
                    [lidar_new, state_new, target_action])[0][0]
                # NOTE(review): this blends reward and bootstrapped value with
                # `alpha`; the usual TD target is `reward + gamma * Q'`.
                # Kept as written - confirm the weighting is intentional.
                Q_target = ((1 - self.alpha) * Q_target
                            + self.alpha * self.gamma * future_reward)
                Q_target = Q_target.reshape(1, -1)
            self.critic_local.fit(x=[lidar, state, act], y=Q_target,
                                  verbose=0, epochs=1)

    def _train_actor(self, sample_indx):
        """Apply one policy-gradient step to the local actor per transition."""
        for observation, _act, _reward, _observation_new, _ in sample_indx:
            lidar, state = self._split_observation(observation)

            predicted_action = self.actor_local.predict([lidar, state])
            # Gradient of the critic output w.r.t. the action just predicted.
            grads = self.sess.run(self.critic_grads, feed_dict={
                self.critic_lidar_input: lidar,
                self.critic_state_input: state,
                self.critic_action_input: predicted_action})[0]

            self.sess.run(self.optimize, feed_dict={
                self.actor_lidar_input: lidar,
                self.actor_state_input: state,
                self.actor_critic_grads: grads})

    def _soft_update(self, local_model, target_model):
        """Move ``target_model`` towards ``local_model`` by a factor ``tau``."""
        blended = [self.tau * local_w + (1 - self.tau) * target_w
                   for local_w, target_w in zip(local_model.get_weights(),
                                                target_model.get_weights())]
        target_model.set_weights(blended)

    def _update_actor_target(self):
        """Soft-update the target actor from the local actor."""
        self._soft_update(self.actor_local, self.actor_target)

    def _update_critic_target(self):
        """Soft-update the target critic from the local critic."""
        self._soft_update(self.critic_local, self.critic_target)

    def update_target(self):
        """Soft-update both target networks."""
        self._update_actor_target()
        self._update_critic_target()

    def TRAIN(self, batch):
        """Run one training round on ``batch`` transitions sampled at random.

        Does nothing (returns ``None``) until the replay buffer holds at
        least ``batch`` transitions.  After training, the reward log is
        cleared and the exploration rate is decayed while it is still at or
        above its floor.
        """
        if len(self.deck) < batch:
            return

        sample_indx = random.sample(self.deck, batch)

        self._train_critic(sample_indx)
        self._train_actor(sample_indx)
        self.update_target()
        self.ep_rewards = []

        if self.e >= self.e_:
            self.e *= self.dc
            


if __name__ == '__main__':
    # Script entry point: build the BipedalWalker environment and the agent,
    # then roll out a few rendered episodes while recording every transition
    # in the agent's replay buffer.
    env = gym.make('BipedalWalker-v2')
    env = env.unwrapped
    nx = env.observation_space.shape
    ny = env.action_space.shape
    sess = tf.Session()
    K.set_session(sess)
    agent = Actor_Critic(nx, ny, "BipedalWalker_model.h5", sess, 15)

    max_steps = 10000   # hard cap per episode; the unwrapped env has no time limit

    # NOTE(review): agent.TRAIN(...) is never called here, so this loop only
    # collects experience - confirm whether training is meant to run per step.
    for i in range(3):
        observation = env.reset().reshape(1, -1)
        for _step in range(max_steps):
            env.render()
            # choose_action may return (ny,) or (1, ny); flatten to the
            # action-space shape before stepping/storing.
            action = agent.choose_action(observation).reshape(ny)
            observation_new, reward, flag, inf = env.step(np.clip(action, -1, 1))
            observation_new = observation_new.reshape(1, -1)
            agent.storing(observation, action, reward, observation_new, flag)
            if flag:
                # Episode finished: stop stepping a terminated environment and
                # let the outer loop reset it.
                break
            observation = observation_new

    
        
            

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.




Instructions for updating:
Use tf.cast instead.


