In [1]:
# If you have more than one GPU, uncomment the lines below and set CUDA_VISIBLE_DEVICES to '0' or '1' to choose which GPU to use.
# import os
# os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import gym
import numpy as np
# from reinforce_tf2 import Agent
# from utils import plotLearning
import tensorflow as tf
# from networks import PolicyGradientNetwork
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense 
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam

In [2]:
class PolicyGradientNetwork(keras.Model):
    """Feed-forward policy network: two ReLU hidden layers and a softmax head.

    Args:
        n_actions: Number of units in the softmax output layer.
        fc1_dims: Number of units in the first hidden layer.
        fc2_dims: Number of units in the second hidden layer.
    """

    def __init__(self, n_actions, fc1_dims=256, fc2_dims=256):
        super().__init__()
        # Keep the sizes around as plain attributes for inspection.
        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims

        # Layers are created in forward-pass order.
        self.fc1 = Dense(fc1_dims, activation='relu')
        self.fc2 = Dense(fc2_dims, activation='relu')
        self.pi = Dense(n_actions, activation='softmax')

    def call(self, state):
        """Run `state` through both hidden layers and return the softmax output."""
        hidden = self.fc2(self.fc1(state))
        return self.pi(hidden)

In [10]:
class Agent:
    """Policy-gradient agent that learns from complete episodes.

    Transitions are buffered with `store_transition` while an episode runs;
    `learn` then performs one gradient step on the sum over time steps of
    `-G_t * log pi(a_t | s_t)` and empties the buffers.

    Args:
        alpha: Learning rate passed to the Adam optimizer.
        gamma: Discount factor used when computing returns.
        n_actions: Number of discrete actions (size of the policy output).
        layer1_size: Units in the policy network's first hidden layer.
        layer2_size: Units in the policy network's second hidden layer.
    """

    def __init__(self, alpha=0.003, gamma=0.99, n_actions=4,
                 layer1_size=256, layer2_size=256):

        self.gamma = gamma
        self.lr = alpha
        self.n_actions = n_actions
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        # Forward the layer sizes so these constructor arguments actually
        # control the width of the network.
        self.policy = PolicyGradientNetwork(n_actions=n_actions,
                                            fc1_dims=layer1_size,
                                            fc2_dims=layer2_size)
        self.policy.compile(optimizer=Adam(learning_rate=self.lr))

    def choose_action(self, observation):
        """Sample one action from the policy's distribution for `observation`.

        The observation is wrapped in a list to form a batch of one, and the
        single sampled action is unwrapped from the result.
        """
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        probs = self.policy(state)
        action_probs = tfp.distributions.Categorical(probs=probs)
        action = action_probs.sample()
        return action.numpy()[0]

    def store_transition(self, observation, action, reward):
        """Append one time step (observation, action, reward) to the buffers."""
        self.state_memory.append(observation)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def _discounted_returns(self, rewards):
        """Return G with G[t] = sum_{k >= t} gamma**(k - t) * rewards[k].

        Computed in a single backward pass (G[t] = r[t] + gamma * G[t + 1]).
        The result is always float64, so integer rewards are not truncated.
        """
        rewards = np.asarray(rewards, dtype=np.float64)
        returns = np.zeros(rewards.shape[0], dtype=np.float64)
        running = 0.0
        for t in range(rewards.shape[0] - 1, -1, -1):
            running = rewards[t] + self.gamma * running
            returns[t] = running
        return returns

    def learn(self):
        """Take one policy-gradient step on the buffered episode, then clear it.

        Does nothing when no transitions have been stored.
        """
        if not self.reward_memory:
            # Nothing to learn from; with an empty buffer there would be no
            # differentiable loss to take a gradient of.
            return

        states = tf.convert_to_tensor(
            np.asarray(self.state_memory, dtype=np.float32))
        # Categorical.log_prob indexes by class id, so hand it integers.
        actions = tf.convert_to_tensor(
            np.asarray(self.action_memory, dtype=np.int32))
        returns = tf.convert_to_tensor(
            self._discounted_returns(self.reward_memory).astype(np.float32))

        with tf.GradientTape() as tape:
            # One batched forward pass over every step of the episode.
            probs = self.policy(states)
            action_probs = tfp.distributions.Categorical(probs=probs)
            log_probs = action_probs.log_prob(actions)
            loss = -tf.reduce_sum(returns * log_probs)

        gradient = tape.gradient(loss, self.policy.trainable_variables)
        self.policy.optimizer.apply_gradients(zip(gradient, self.policy.trainable_variables))

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

In [11]:
# Train the agent on CartPole-v1, printing the score of each episode and the
# mean score over the most recent (up to 100) episodes.
if __name__ == '__main__':
    agent = Agent(alpha=0.0005,  gamma=0.99, n_actions=2)

    env = gym.make('CartPole-v1')
    score_history = []

    num_episodes = 2000

    # try/finally guarantees the environment (and its render window) is
    # released even when training is interrupted or raises.
    try:
        for i in range(num_episodes):
            done = False
            score = 0
            # NOTE(review): reset() is used as returning the observation only
            # and step() as returning four values -- confirm the installed
            # gym version matches this API.
            observation = env.reset()
            while not done:
                action = agent.choose_action(observation)
                observation_, reward, done, info = env.step(action)
                # Store the observation the action was chosen from, not the
                # one that resulted from it.
                agent.store_transition(observation, action, reward)
                observation = observation_
                score += reward
                env.render()
            score_history.append(score)

            # One policy update per finished episode.
            agent.learn()
            avg_score = np.mean(score_history[-100:])
            print('episode: ', i,'score: %.1f' % score,
                'average score %.1f' % avg_score)
    finally:
        env.close()

episode:  0 score: 34.0 average score 34.0
episode:  1 score: 24.0 average score 29.0
episode:  2 score: 26.0 average score 28.0
episode:  3 score: 10.0 average score 23.5
episode:  4 score: 17.0 average score 22.2
episode:  5 score: 16.0 average score 21.2
episode:  6 score: 11.0 average score 19.7
episode:  7 score: 13.0 average score 18.9
episode:  8 score: 15.0 average score 18.4
episode:  9 score: 18.0 average score 18.4
episode:  10 score: 28.0 average score 19.3
episode:  11 score: 18.0 average score 19.2
episode:  12 score: 32.0 average score 20.2
episode:  13 score: 20.0 average score 20.1
episode:  14 score: 33.0 average score 21.0
episode:  15 score: 23.0 average score 21.1
episode:  16 score: 23.0 average score 21.2
episode:  17 score: 26.0 average score 21.5
episode:  18 score: 16.0 average score 21.2
episode:  19 score: 32.0 average score 21.8
episode:  20 score: 16.0 average score 21.5
episode:  21 score: 56.0 average score 23.0
episode:  22 score: 19.0 average score 22.

KeyboardInterrupt: 