In [1]:
# !pip install gym

In [2]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
np.random.seed(17)
tf.random.set_seed(17)

In [4]:
env = gym.make('CartPole-v1')
# env.action_space.seed(17)
# env.observation_space.seed(17)

In [5]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [6]:
env.action_space

Discrete(2)

In [12]:
num_inputs = 4
num_actions = 2

num_hidden = 128

inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation='relu')(inputs)
action = layers.Dense(num_actions, activation='softmax')(common)
critic = layers.Dense(1)(common)
                      
model = keras.Model(inputs=inputs, outputs=[action, critic])

In [8]:
optimizer = keras.optimizers.legacy.Adam(learning_rate=0.01)

loss = keras.losses.Huber()

In [39]:
state = env.reset()
state = tf.convert_to_tensor(state[0])
state = tf.expand_dims(state, 0)  
action_probs, critic_value = model(state)
print(action_probs, critic_value)

action = np.random.choice(num_actions, p=np.squeeze(action_probs))
action

tf.Tensor([[0.49940127 0.5005987 ]], shape=(1, 2), dtype=float32) tf.Tensor([[0.00775559]], shape=(1, 1), dtype=float32)


0

In [11]:
gamma = 0.99

max_steps_per_episode = 10000

eps = np.finfo(np.float32).eps.item()

action_probs_history = []
critic_value_history = []
rewards_history = []

running_reward = 0 
episode_count = 0

while True:
    state = env.reset()
    episode_reward = 0
    
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            env.render()
            
            state = tf.convert_to_tensor(state[0])
            state = tf.expand_dims(state, 0)
            
            action_probs, critic_value = model(state)
            
            critic_value_history.append(critic_value[0, 0])

            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            # action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            
            action_probs_history.append(tf.math.log(action_probs[0, action]))
                                        
            state, reward, done, _, _ = env.step(action)
                                        
            rewards_history.append(reward)
                                        
            episode_reward += reward
                                        
            if done:
                break
            
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward
        
        returns = []
                                        
        discounted_sum = 0
        
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)
        
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()
                                        
        history = zip(action_probs_history, critic_value_history, returns)
                                        
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            diff = ret - value 
            actor_losses.append(-log_prob * diff)
                                        
            critic_losses.append(
                loss(tf.expand_dims(value,0), tf.expand_dims(ret, 0))
            )
                
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
                                  
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()
    
    episode_count += 1
    
    if episode_count % 10 == 0:
        print(f'Episode: {episode_count}, running reward: {running_reward:.2f}')
              
    if running_reward > 195:
        print(f'Solved at episode {episode_count}!')
        break

env.close()

ValueError: Exception encountered when calling layer 'model' (type Functional).

Input 0 of layer "dense" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (1,)

Call arguments received by layer 'model' (type Functional):
  • inputs=tf.Tensor(shape=(1,), dtype=float32)
  • training=None
  • mask=None