In [1]:
import tensorflow as tf
import numpy as np
import gym

# Define the Actor network
class ActorNetwork(tf.keras.Model):
    def __init__(self, num_actions):
        super(ActorNetwork, self).__init__()
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(64, activation='relu')
        self.output_layer = tf.keras.layers.Dense(num_actions, activation='softmax')

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.output_layer(x)

# Define the Critic network
class CriticNetwork(tf.keras.Model):
    def __init__(self):
        super(CriticNetwork, self).__init__()
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(64, activation='relu')
        self.output_layer = tf.keras.layers.Dense(1)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.output_layer(x)

# Function to compute discounted rewards
def discount_rewards(rewards, gamma=0.99):
    discounted_rewards = np.zeros_like(rewards, dtype=np.float32)
    running_add = 0
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted_rewards[t] = running_add
    return discounted_rewards

# Function to compute advantages
def compute_advantages(critic, states, rewards):
    values = critic(np.array(states))
    advantages = rewards - tf.squeeze(values)
    return advantages

# Function to compute actor loss
def compute_actor_loss(actor, states, actions, advantages):
    logits = actor(np.array(states))
    action_probs = tf.nn.softmax(logits)
    action_masks = tf.one_hot(actions, depth=2, dtype=tf.float32)
    log_action_probs = tf.reduce_sum(action_masks * tf.math.log(action_probs), axis=1)
    actor_loss = -tf.reduce_mean(log_action_probs * advantages)
    return actor_loss

# Function to compute critic loss
def compute_critic_loss(critic, states, discounted_rewards):
    values = tf.squeeze(critic(np.array(states)))
    critic_loss = tf.reduce_mean(tf.square(values - discounted_rewards))
    return critic_loss

# Initialize the environment
env = gym.make('CartPole-v1')
num_actions = env.action_space.n

# Initialize the actor and critic networks
actor = ActorNetwork(num_actions)
critic = CriticNetwork()

# Initialize the optimizers
optimizer_actor = tf.keras.optimizers.Adam(learning_rate=0.01)
optimizer_critic = tf.keras.optimizers.Adam(learning_rate=0.01)

# Training loop
for episode in range(1000):
    state = env.reset()
    episode_reward = 0

    states = []
    actions = []
    rewards = []

    while True:
        env.render()  # Render the environment

        states.append(state)

        action_probs = actor(np.expand_dims(state, axis=0))
        action = np.random.choice(num_actions, p=np.squeeze(action_probs))
        actions.append(action)

        next_state, reward, done, _ = env.step(action)
        rewards.append(reward)

        episode_reward += reward
        state = next_state

        if done:
            discounted_rewards = discount_rewards(rewards)
            advantages = compute_advantages(critic, states, discounted_rewards)

            with tf.GradientTape() as tape_actor, tf.GradientTape() as tape_critic:
                actor_loss = compute_actor_loss(actor, states, actions, advantages)
                critic_loss = compute_critic_loss(critic, states, discounted_rewards)

            gradients_actor = tape_actor.gradient(actor_loss, actor.trainable_variables)
            gradients_critic = tape_critic.gradient(critic_loss, critic.trainable_variables)

            optimizer_actor.apply_gradients(zip(gradients_actor, actor.trainable_variables))
            optimizer_critic.apply_gradients(zip(gradients_critic, critic.trainable_variables))

            print("Episode: {}, Reward: {}".format(episode, episode_reward))
            break

env.close()


2024-03-24 19:17:35.976647: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-24 19:17:36.144148: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-24 19:17:39.539531: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-24 19:17:39.586181: I tensorflow/comp

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).