In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Define the CartPole-v1 environment
env = gym.make('CartPole-v1')

# Define the parameters
num_states = env.observation_space.shape[0]
num_actions = env.action_space.n
learning_rate = 0.01
gamma = 0.99
num_episodes = 1000
max_steps_per_episode = 1000

# Define the policy network
policy_network = tf.keras.Sequential([
    Dense(24, activation='relu', input_shape=(num_states,)),
    Dense(24, activation='relu'),
    Dense(num_actions, activation='softmax')
])

# Define the optimizer
optimizer = Adam(learning_rate)

# Define a function to compute the discounted rewards
def compute_discounted_rewards(rewards):
    discounted_rewards = np.zeros_like(rewards, dtype=np.float32)
    cumulative_rewards = 0
    for i in reversed(range(len(rewards))):
        cumulative_rewards = cumulative_rewards * gamma + rewards[i]
        discounted_rewards[i] = cumulative_rewards
    return discounted_rewards

# Define the training loop
for episode in range(num_episodes):
    state = env.reset()
    episode_reward = 0
    states = []
    actions = []
    rewards = []

    for step in range(max_steps_per_episode):
        # Render the environment
        env.render()

        # Choose an action based on the policy
        action_probs = policy_network.predict(state.reshape(1, -1)).flatten()
        action = np.random.choice(num_actions, p=action_probs)

        # Take the chosen action
        next_state, reward, done, _ = env.step(action)

        # Record the trajectory
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        episode_reward += reward

        if done:
            break

        state = next_state

    # Compute the discounted rewards
    discounted_rewards = compute_discounted_rewards(rewards)

    # Compute the policy gradient
    with tf.GradientTape() as tape:
        loss = 0
        for i in range(len(states)):
            state = states[i]
            action = actions[i]
            discounted_reward = discounted_rewards[i]

            # Calculate the log probability of the selected action
            action_probs = policy_network(state.reshape(1, -1))
            selected_action_prob = action_probs[0, action]
            log_action_prob = tf.math.log(selected_action_prob)

            # Compute the policy gradient
            loss -= log_action_prob * discounted_reward

    # Apply the policy gradient
    grads = tape.gradient(loss, policy_network.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy_network.trainable_variables))

    # Print episode statistics
    print(f"Episode {episode + 1}: Total Reward = {episode_reward}")

# Close the environment after training
env.close()


2024-03-24 19:40:37.393590: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-24 19:40:37.427122: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-24 19:40:39.933818: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-24 19:40:39.957229: I tensorflow/comp

AttributeError: 'tuple' object has no attribute 'reshape'