In [2]:
import tensorflow.compat.v1 as tf
import gym
import numpy as np
# The Agent class below is built from TF1-style placeholders and an explicit
# tf.Session, so eager execution is switched off before any graph is created.
tf.disable_eager_execution()

In [84]:
class Agent:
    """Policy-gradient agent backed by a two-layer dense network.

    The graph maps a single observation to action logits, samples one action
    from them, and exposes the gradients of the cross-entropy between the
    sampled action and the logits. Training is done by feeding externally
    weighted ("rewarded") gradients back in through placeholders and applying
    them with Adam.

    Attributes set by ``__init__`` and used by callers:
        input: placeholder for one observation, shape ``[1, input_dimension]``.
        action: op sampling one action index (callers read ``action[0, 0]``).
        gradients: list of gradient tensors, one per trainable variable.
        rewarded_grads_placeholders_list: placeholders matching ``gradients``.
        train_operation: op applying the fed gradients.
        ses: the ``tf.Session`` owning the variables.
    """

    def __init__(self, learning_rate, input_dimension=4, hidden_dimension=32, action_count=2):
        """Build the graph, create the session and initialise the variables.

        Args:
            learning_rate: learning rate passed to ``tf.train.AdamOptimizer``.
            input_dimension: number of features in one observation.
            hidden_dimension: number of units in the hidden dense layer.
            action_count: number of discrete actions (size of the logits).

        Note:
            Calls ``tf.reset_default_graph()``, so the default graph is
            replaced every time an ``Agent`` is constructed.
        """
        # Build the network to predict the correct action
        tf.reset_default_graph()
        self.input = tf.placeholder(dtype=tf.float32, shape=[1, input_dimension], name='X')
        # No activation argument is passed, so both layers use tf.layers.dense's default.
        hidden_layer = tf.layers.dense(
            self.input, hidden_dimension, kernel_initializer=tf.initializers.random_normal())
        logits = tf.layers.dense(
            hidden_layer, action_count, kernel_initializer=tf.initializers.random_normal())

        # Sample one action according to the network's output
        self.action = tf.random.categorical(logits, 1)

        # Optimization according to policy gradient algorithm:
        # cross-entropy against the sampled action is -log p(action | observation),
        # so its gradient is the quantity that later gets weighted by the reward.
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=tf.one_hot(self.action, action_count), logits=logits)
        self.optimizer = tf.train.AdamOptimizer(learning_rate)
        # gradient of current action w.r.t. network's variables
        grads_vars = self.optimizer.compute_gradients(cross_entropy)
        self.gradients = [grad for grad, var in grads_vars]

        # The caller gets rewards from the environment, weights the gradients
        # with them, feeds the result into these placeholders and then runs
        # the train operation.
        self.rewarded_grads_placeholders_list = []
        rewarded_grads_and_vars = []
        for grad, var in grads_vars:
            rewarded_grad_placeholder = tf.placeholder(dtype=tf.float32, shape=grad.shape)
            self.rewarded_grads_placeholders_list.append(rewarded_grad_placeholder)
            rewarded_grads_and_vars.append((rewarded_grad_placeholder, var))

        self.train_operation = self.optimizer.apply_gradients(rewarded_grads_and_vars)

        self.saver = tf.train.Saver()

        # Session configured with a GPU device count of zero.
        config = tf.ConfigProto(
            device_count={'GPU': 0},
            gpu_options=tf.GPUOptions(allow_growth=True)
        )

        self.ses = tf.Session(config=config)
        self.ses.run(tf.global_variables_initializer())

    def get_action_and_gradients(self, obs):
        """Sample an action for ``obs`` and return it with its gradients.

        Args:
            obs: one observation shaped to match ``self.input``.

        Returns:
            The result of running ``[self.action, self.gradients]``: the
            sampled action followed by the list of per-variable gradients.
        """
        return self.ses.run([self.action, self.gradients], feed_dict={self.input: obs})

    def train(self, rewarded_gradients):
        """Apply one optimizer step using externally weighted gradients.

        Args:
            rewarded_gradients: sequence of arrays, one per entry of
                ``self.rewarded_grads_placeholders_list`` and in the same order.

        Raises:
            ValueError: if the number of arrays does not match the number of
                gradient placeholders.
        """
        placeholders = self.rewarded_grads_placeholders_list
        if len(rewarded_gradients) != len(placeholders):
            raise ValueError(
                "expected {} gradient arrays, got {}".format(
                    len(placeholders), len(rewarded_gradients)))
        # feed gradients into the placeholders and call train operation
        feed_dict = dict(zip(placeholders, rewarded_gradients))
        self.ses.run([self.train_operation], feed_dict=feed_dict)

    def save(self):
        """Write the session's variables to the "SavedModel/" checkpoint prefix."""
        self.saver.save(self.ses, "SavedModel/")

    def load(self):
        """Restore the session's variables from the "SavedModel/" checkpoint prefix."""
        self.saver.restore(self.ses, "SavedModel/")

In [85]:
# --- Training configuration -------------------------------------------------
# Optimisation settings
learning_rate = 0.01          # passed to Agent(...) when the agent is created
discount_factor = 0.99        # base of the geometric sum in discount_function

# Schedule: how much experience is collected
epochs = 100                  # upper bound on the number of training epochs
games_per_epoch = 128         # episodes played before each training step
max_steps_per_game = 1_000    # cap on the steps taken in a single episode

def discount_function(factor, n):
    """Return the geometric sum ``1 + factor + ... + factor**(n - 1)``.

    Args:
        factor: discount factor (the ratio of the geometric series).
        n: number of terms to sum.

    Returns:
        The sum as computed by the closed form ``(1 - factor**n) / (1 - factor)``;
        for ``factor == 1`` the closed form is undefined and the sum of ``n``
        ones, ``float(n)``, is returned instead.
    """
    if factor == 1:
        # Undiscounted case: the closed form below would divide by zero.
        return float(n)
    return (1 - factor**n)/(1 - factor)

# Train the policy. Each epoch plays `games_per_epoch` episodes, recording the
# per-step gradients returned by the agent, then applies a single training
# step with the reward-weighted average of those gradients.
agent = Agent(learning_rate)
game = gym.make("CartPole-v0").env
for epoch in range(epochs):
    epoch_rewards = []
    epoch_gradients = []
    epoch_average_reward = 0
    for episode in range(games_per_epoch):
        obs = game.reset()
        step = 0
        single_episode_rewards = []
        single_episode_gradients = []
        game_over = False
        while not game_over and step < max_steps_per_game:
            step += 1
            # The agent's input placeholder takes one observation as a single row.
            action, gradients = agent.get_action_and_gradients(obs.reshape(1, -1))
            obs, reward, game_over, info = game.step(action[0, 0])
            single_episode_rewards.append(reward)
            single_episode_gradients.append(gradients)

        epoch_rewards.append(single_episode_rewards)
        epoch_gradients.append(single_episode_gradients)
        epoch_average_reward += sum(single_episode_rewards)

    epoch_average_reward /= games_per_epoch
    print("Epoch = {}, , Average reward = {}".format(epoch, epoch_average_reward))

    # Weight for step i of an episode: the geometric sum over the remaining
    # steps minus the epoch's average episode reward. Only the episode length
    # is used here, i.e. every step is treated as having reward 1.
    normalized_rewards = [
        [discount_function(discount_factor, len(rewards) - i) - epoch_average_reward
         for i in range(len(rewards))]
        for rewards in epoch_rewards
    ]

    # One accumulator per gradient tensor of the agent (no hard-coded count).
    mean_rewarded_gradients = [np.zeros(grad.shape) for grad in agent.gradients]
    for episode_weights, episode_gradients in zip(normalized_rewards, epoch_gradients):
        for weight, step_gradients in zip(episode_weights, episode_gradients):
            for k, grad in enumerate(step_gradients):
                mean_rewarded_gradients[k] += (weight * grad) / games_per_epoch

    agent.train(mean_rewarded_gradients)
    # Stop early once the average episode reward is high enough.
    if epoch_average_reward > 900:
        break
agent.save()
game.close()


Epoch = 0, , Average reward = 45.65625
Epoch = 1, , Average reward = 48.1328125
Epoch = 2, , Average reward = 48.9375
Epoch = 3, , Average reward = 48.6640625
Epoch = 4, , Average reward = 51.5390625
Epoch = 5, , Average reward = 53.4140625
Epoch = 6, , Average reward = 52.375
Epoch = 7, , Average reward = 57.5390625
Epoch = 8, , Average reward = 56.03125
Epoch = 9, , Average reward = 59.2421875
Epoch = 10, , Average reward = 62.4296875
Epoch = 11, , Average reward = 64.546875
Epoch = 12, , Average reward = 64.546875
Epoch = 13, , Average reward = 66.7109375
Epoch = 14, , Average reward = 68.578125
Epoch = 15, , Average reward = 77.7578125
Epoch = 16, , Average reward = 75.7421875
Epoch = 17, , Average reward = 85.578125
Epoch = 18, , Average reward = 88.9375
Epoch = 19, , Average reward = 100.2109375
Epoch = 20, , Average reward = 109.4609375
Epoch = 21, , Average reward = 124.8046875
Epoch = 22, , Average reward = 145.921875
Epoch = 23, , Average reward = 192.640625
Epoch = 24, , Ave

In [86]:
# Run this part after training the network
# Plays a fixed number of episodes with the restored weights and reports the
# average number of steps per episode.
evaluation_games = 10
game = gym.make("CartPole-v0").env
agent.load()
score = 0
try:
    for i in range(evaluation_games):
        obs = game.reset()
        game_over = False
        # NOTE(review): there is no step cap here (unlike the training loop), so
        # an episode only ends when the environment itself reports `game_over`.
        while not game_over:
            score += 1
            image = game.render(mode='rgb_array')  # Call this to render game and show visual
            action, _ = agent.get_action_and_gradients(obs.reshape(-1, 4))
            obs, reward, game_over, info = game.step(action[0,0])
finally:
    # Close the environment even if the run is interrupted, mirroring the
    # `game.close()` at the end of the training cell.
    game.close()

print("Average Score = ", score / evaluation_games)


INFO:tensorflow:Restoring parameters from SavedModel/
Average Score =  136798.5
