In [14]:
import gym
import numpy as np
import tensorflow as tf
import time
from IPython.display import clear_output
import random
from gym.envs.registration import  register

In [15]:
# Instantiate the CartPole-v1 environment from the gym registry.
env = gym.make("CartPole-v1")

In [16]:
# Build the environment from a named constant, then report its action and
# observation spaces so the reader can see what the agent has to handle.
env_name = "CartPole-v1"
env = gym.make(env_name)
print("The action space is : {}".format(env.action_space))
print("The state space is : {}".format(env.observation_space))

The action space is : Discrete(2)
The state space is : Box(4,)


In [17]:
class Agent(object):
    """Base agent: a uniformly random policy plus shared policy-gradient helpers.

    Subclasses that call ``create_grads_vars_feed`` must first define
    ``self.optimizer`` and ``self.cross_entropy``; this base class does not
    create them.
    """

    def __init__(self,env):
        """Inspect ``env.action_space`` and record what is needed to sample from it.

        Args:
            env: an environment exposing ``action_space``.
        """
        # isinstance (instead of an exact type comparison) also accepts
        # subclasses of Discrete.
        self.is_discreet_action = isinstance(env.action_space, gym.spaces.discrete.Discrete)

        if self.is_discreet_action:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_range_low = env.action_space.low
            self.action_range_high = env.action_space.high
            self.action_shape = env.action_space.shape     # Discrete spaces have an empty shape

    def get_action(self,state):
        """Return a uniformly random action; ``state`` is ignored.

        Returns:
            An int in ``[0, action_size)`` for discrete spaces, otherwise an
            array drawn uniformly between the stored low and high bounds.
        """
        if self.is_discreet_action:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_range_low, self.action_range_high,
                                 self.action_shape)

    def create_grads_vars_feed(self):
        """Create gradient tensors and matching feedable placeholders.

        Populates:
            self.gradients: gradient tensors of ``self.cross_entropy``.
            self.gradient_placeholders: one float32 placeholder per gradient,
                with the same static shape.
            self.grads_and_vars_feed: ``(placeholder, variable)`` pairs that
                can be passed to ``optimizer.apply_gradients``.
            self.tf_gradient_placeholder: the last placeholder created
                (``None`` if there were none).
        """
        self.gradients = []
        self.grads_and_vars_feed = []
        self.gradient_placeholders = []
        self.tf_gradient_placeholder = None

        for gradient, variable in self.optimizer.compute_gradients(self.cross_entropy):
            # compute_gradients may pair a variable with None when the loss
            # does not depend on it; such variables cannot be fed or updated.
            if gradient is None:
                continue
            placeholder = tf.placeholder(dtype=tf.float32, shape=gradient.get_shape())
            self.gradients.append(gradient)
            self.gradient_placeholders.append(placeholder)
            self.grads_and_vars_feed.append((placeholder, variable))
            self.tf_gradient_placeholder = placeholder

    def helper_discount_rewards(self,rewards,discount_rate):
        '''
        Return the discounted return for every step of one episode.

        Element ``t`` of the result is
        ``rewards[t] + discount_rate * rewards[t+1] + discount_rate**2 * rewards[t+2] + ...``.
        '''
        discounted_rewards = np.zeros(len(rewards))
        cumulative_rewards = 0
        # Walk backwards so each step reuses the already-discounted future return.
        for step in reversed(range(len(rewards))):
            cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
            discounted_rewards[step] = cumulative_rewards
        return discounted_rewards


    def discount_and_normalize_rewards(self, all_rewards, discount_rate):
        '''
        Discount every episode's rewards, then normalize across all episodes.

        Args:
            all_rewards: sequence of per-episode reward sequences.
            discount_rate: discount factor applied per step.

        Returns:
            A list with one array per episode, shifted by the global mean and
            scaled by the global standard deviation of all discounted rewards.
            An empty input yields an empty list. When the standard deviation
            is zero the values are only mean-centered, so the result never
            contains NaN/inf from a division by zero.
        '''
        all_discounted_rewards = [self.helper_discount_rewards(rewards, discount_rate)
                                  for rewards in all_rewards]
        if not all_discounted_rewards:
            return []

        flat_rewards = np.concatenate(all_discounted_rewards)
        if flat_rewards.size == 0:
            # Only empty episodes: nothing to normalize.
            return all_discounted_rewards

        reward_mean = flat_rewards.mean()
        reward_std = flat_rewards.std()
        if reward_std == 0:
            return [discounted_rewards - reward_mean
                    for discounted_rewards in all_discounted_rewards]
        return [(discounted_rewards - reward_mean) / reward_std
                for discounted_rewards in all_discounted_rewards]
    
    
            

In [18]:
class QNAgent(Agent):
    """Policy-gradient agent for a two-action task (TensorFlow 1.x graph API).

    The network maps a ``num_inputs``-dimensional state to a single logit whose
    sigmoid is the probability of action 0. Per-step gradients of the
    cross-entropy are collected while playing and later applied, weighted by
    rewards, in ``train``.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01):
        """Build the graph, open a session and initialize all variables.

        Args:
            env: environment passed on to ``Agent.__init__``.
            discount_rate: reward discount factor stored on the agent.
            learning_rate: learning rate for the Adam optimizer.
        """
        super().__init__(env)

        self.num_game_rounds = 10
        self.max_game_steps = 1000
        self.num_iterations = 250

        self.eps = 1.0
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate

        self.build_model()

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver  = tf.train.Saver()

    def build_model(self):
        """Reset the default graph and build policy network, loss and training op."""
        self.num_inputs = 4
        self.num_hidden = 4
        self.num_outputs = 1

        tf.reset_default_graph()
        self.initializer = tf.contrib.layers.variance_scaling_initializer()

        ### STATE placeholder ###
        self.state_in_tf = tf.placeholder(tf.float32, shape = [None,self.num_inputs])

        ### NETWORK ###
        hidden_layer = tf.layers.dense(self.state_in_tf, self.num_hidden,
                                       activation=tf.nn.elu,
                                       kernel_initializer=self.initializer)
        logits = tf.layers.dense(hidden_layer, self.num_outputs)
        outputs = tf.nn.sigmoid(logits)

        ### ACTIONS and ACTION LABELS ###
        # Column 0 is P(action 0), column 1 is P(action 1).
        probabilities = tf.concat(axis=1, values=[outputs, 1 - outputs])
        # tf.multinomial expects unnormalized *log*-probabilities; passing raw
        # probabilities would sample from softmax(p) rather than from p.
        self.action_tf = tf.multinomial(tf.log(probabilities), num_samples=1)
        # Treat the chosen action as the correct label: action 0 -> y = 1.
        y = 1. - tf.to_float(self.action_tf)

        ### LOSS FUNCTION AND OPTIMIZATION ###
        self.cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)

        # Built here (not in train) so that every variable the optimizer
        # creates exists before global_variables_initializer runs in __init__.
        self.create_grads_vars_feed()
        self._build_training_op()

    def _build_training_op(self):
        """Create the apply-gradients op and remember which feed list it uses."""
        self.training_op = self.optimizer.apply_gradients(self.grads_and_vars_feed)
        self._training_op_feed = self.grads_and_vars_feed

    def get_action(self, state):
        """Choose an action epsilon-greedily and return it with its gradients.

        With probability ``self.eps`` a uniformly random action is taken,
        otherwise one is sampled from the policy. The gradients are always
        evaluated for the action that is actually returned: an exploratory
        action is fed into ``self.action_tf`` so the label ``y`` matches it.
        The exploratory branch assumes a discrete action (a single integer).

        Args:
            state: array reshaped to ``(1, num_inputs)`` before being fed.

        Returns:
            ``(action, gradients_val)`` where ``gradients_val`` holds one array
            per entry of ``self.gradients``.
        """
        feed = {self.state_in_tf: state.reshape(1, self.num_inputs)}
        if random.random() < self.eps:
            action = super().get_action(state)
            # Override the sampled action so the gradients match the action taken.
            feed[self.action_tf] = np.array([[action]], dtype=np.int64)
            gradients_val = self.sess.run(self.gradients, feed_dict=feed)
        else:
            sampled, gradients_val = self.sess.run([self.action_tf, self.gradients],
                                                   feed_dict=feed)
            action = sampled[0][0]
        return (action, gradients_val)

    def train(self,total_rewards,total_gradients):
        """Apply the reward-weighted mean of the collected gradients.

        Args:
            total_rewards: per-game sequences of per-step reward weights.
            total_gradients: per-game, per-step lists of gradient values, as
                returned by ``get_action``.
        """
        # Rebuild the training op only if the placeholder list was regenerated
        # since the op was created; rebuilding on every call would keep adding
        # ops to the graph.
        if getattr(self, "_training_op_feed", None) is not self.grads_and_vars_feed:
            self._build_training_op()

        feed = {}
        for var_index, gradient_placeholder in enumerate(self.gradient_placeholders):
            print ("var_index in gradient_placeholders: {}, gradients ".format(var_index))
            # Average reward * gradient over every step of every game.
            mean_gradients = np.mean([reward * total_gradients[game_index][step][var_index]
                                      for game_index, rewards in enumerate(total_rewards)
                                          for step, reward in enumerate(rewards)], axis=0)
            feed[gradient_placeholder] = mean_gradients
        print("Going to train now")
        self.sess.run(self.training_op, feed_dict=feed)

    def saveModel(self, model_path='/models/policyGradClass'):
        """Export the meta graph and save the session variables.

        Args:
            model_path: checkpoint path prefix; the meta graph is written to
                ``model_path + '.meta'``. Defaults to the original location.
        """
        print('SAVING GRAPH AND SESSION')
        self.meta_graph_def = tf.train.export_meta_graph(filename=model_path + '.meta')
        self.saver.save(self.sess, model_path)

    def __del__(self):
        # __init__ may have failed before the session was created.
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()
        
# Instantiate the agent; the constructor builds the TF graph, opens a session
# and prints the action size for discrete action spaces.
agent = QNAgent(env)

Action size: 2


In [None]:
# Policy-gradient training loop.
#
# The gradient tensors and placeholders are already created inside
# QNAgent.build_model(), so create_grads_vars_feed() is not called again here:
# a second call would only add duplicate ops and placeholders to the graph.
try:
    for iteration in range(agent.num_iterations):
        print("Current Iteration: {} \n".format(iteration))

        total_rewards = []      # one list of per-step rewards per game
        total_gradients = []    # one list of per-step gradient values per game

        for game in range(agent.num_game_rounds):
            game_rewards = []
            game_gradients = []

            state = env.reset()

            for _ in range(agent.max_game_steps):
                action, gradients_val = agent.get_action(state)
                next_state, reward, done, info = env.step(action)

                game_rewards.append(reward)
                game_gradients.append(gradients_val)

                state = next_state
                env.render()

                if done:
                    # Game ended: decay the exploration rate once per finished game.
                    agent.eps = agent.eps * 0.99
                    break

            total_rewards.append(game_rewards)
            total_gradients.append(game_gradients)

        # Turn raw rewards into normalized discounted returns, then update the policy.
        total_rewards = agent.discount_and_normalize_rewards(total_rewards, agent.discount_rate)
        agent.train(total_rewards, total_gradients)
finally:
    # Release the render window / environment resources even if training fails.
    env.close()

agent.saveModel()
print("MODEL SAVED!!!!!")

Current Iteration: 0 

var_index in gradient_placeholders: 0, gradients 
var_index in gradient_placeholders: 1, gradients 
var_index in gradient_placeholders: 2, gradients 
var_index in gradient_placeholders: 3, gradients 
Going to train now
Current Iteration: 1 

var_index in gradient_placeholders: 0, gradients 
var_index in gradient_placeholders: 1, gradients 
var_index in gradient_placeholders: 2, gradients 
var_index in gradient_placeholders: 3, gradients 
Going to train now
Current Iteration: 2 

var_index in gradient_placeholders: 0, gradients 
var_index in gradient_placeholders: 1, gradients 
var_index in gradient_placeholders: 2, gradients 
var_index in gradient_placeholders: 3, gradients 
Going to train now
Current Iteration: 3 

var_index in gradient_placeholders: 0, gradients 
var_index in gradient_placeholders: 1, gradients 
var_index in gradient_placeholders: 2, gradients 
var_index in gradient_placeholders: 3, gradients 
Going to train now
Current Iteration: 4 

var_index