In [11]:
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense 
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam
import numpy as np

In [12]:

class PolicyGradientNetwork(keras.Model):
    def __init__(self, n_actions, fc1_dims=256, fc2_dims=256):
        super(PolicyGradientNetwork, self).__init__()
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions

        self.fc1 = Dense(self.fc1_dims, activation='relu')
        self.fc2 = Dense(self.fc2_dims, activation='relu')
        self.pi = Dense(n_actions, activation='softmax')

    def call(self, state):
        value = self.fc1(state)
        value = self.fc2(value)

        pi = self.pi(value)

        return pi

In [13]:

class Agent:
    def __init__(self, alpha=0.003, gamma=0.99, n_actions=4,
                 layer1_size=128, layer2_size=128):

        self.gamma = gamma
        self.lr = alpha
        self.n_actions = n_actions
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.policy = PolicyGradientNetwork(n_actions=n_actions)
        self.policy.compile(optimizer=Adam(learning_rate=self.lr))

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation],dtype=tf.float32)
        probs = self.policy(state)
        action_probs = tfp.distributions.Categorical(probs=probs)
        action = action_probs.sample()
        
        return action.numpy()[0]
    

    def store_transition(self, observation, action, reward):
        self.state_memory.append(observation)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def learn(self):
        actions = tf.convert_to_tensor(self.action_memory, dtype=tf.float32)
        rewards = tf.convert_to_tensor(self.reward_memory)
        
        G = np.zeros_like(rewards)
        for t in range(len(rewards)):
            G_sum = 0
            discount = 1
            for k in range(t, len(rewards)):
                G_sum += rewards[k]*discount
                discount *= self.gamma
                
            G[t] = G_sum
            
        with tf.GradientTape() as tape:
            loss = 0
            for idx,(g , state) in enumerate(zip(G, self.state_memory)):
                state = tf.convert_to_tensor([observation],dtype=tf.float32)
                probs = self.policy(state)
                action_probs = tfp.distributions.Categorical(probs=probs)
                log_prob = action_probs.log_prob(actions[idx])
                loss += -g + tf.squeeze(log_prob)
                
        gradient = tape.gradient(loss, self.policy.trainable_variables)
        self.policy.optimizer.apply_gradients(zip(gradient,self.policy.trainable_variables))
        
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []



In [14]:
# !pip install mujoco_py==2.0.2.8
# !pip install 'gym[all]'
import gym

In [15]:
agent = Agent(alpha=0.001,  gamma=0.99,n_actions=2)
env = gym.make('CartPole-v0')
score_history = []

In [16]:
if __name__ == '__main__':
    
    num_episodes = 350

    for i in range(num_episodes):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.store_transition(observation, action, reward)
            observation = observation_
            score += reward
        score_history.append(score)

        agent.learn()
        avg_score = np.mean(score_history[-100:])
        print('episode: ', i,'score: %.1f' % score,
            'average score %.1f' % avg_score)

episode:  0 score: 16.0 average score 16.0
episode:  1 score: 21.0 average score 18.5
episode:  2 score: 34.0 average score 23.7
episode:  3 score: 17.0 average score 22.0
episode:  4 score: 30.0 average score 23.6
episode:  5 score: 9.0 average score 21.2
episode:  6 score: 17.0 average score 20.6
episode:  7 score: 84.0 average score 28.5
episode:  8 score: 44.0 average score 30.2
episode:  9 score: 96.0 average score 36.8
episode:  10 score: 69.0 average score 39.7
episode:  11 score: 23.0 average score 38.3
episode:  12 score: 51.0 average score 39.3
episode:  13 score: 15.0 average score 37.6
episode:  14 score: 28.0 average score 36.9
episode:  15 score: 92.0 average score 40.4
episode:  16 score: 25.0 average score 39.5
episode:  17 score: 79.0 average score 41.7
episode:  18 score: 47.0 average score 41.9
episode:  19 score: 62.0 average score 43.0
episode:  20 score: 43.0 average score 43.0
episode:  21 score: 74.0 average score 44.4
episode:  22 score: 36.0 average score 44.0

episode:  185 score: 64.0 average score 62.9
episode:  186 score: 37.0 average score 61.2
episode:  187 score: 39.0 average score 60.0
episode:  188 score: 56.0 average score 59.6
episode:  189 score: 59.0 average score 59.5
episode:  190 score: 31.0 average score 57.9
episode:  191 score: 33.0 average score 56.8
episode:  192 score: 37.0 average score 56.1
episode:  193 score: 55.0 average score 55.6
episode:  194 score: 30.0 average score 55.2
episode:  195 score: 29.0 average score 54.6
episode:  196 score: 32.0 average score 54.4
episode:  197 score: 51.0 average score 53.9
episode:  198 score: 30.0 average score 53.5
episode:  199 score: 30.0 average score 53.1
episode:  200 score: 58.0 average score 53.2
episode:  201 score: 32.0 average score 53.0
episode:  202 score: 65.0 average score 53.1
episode:  203 score: 31.0 average score 52.9
episode:  204 score: 29.0 average score 52.8
episode:  205 score: 68.0 average score 52.2
episode:  206 score: 76.0 average score 52.5
episode:  