In [None]:
import gym
import numpy as np
import tensorflow as tf

# Policy network: maps a 4-dimensional observation to a probability
# distribution over 2 discrete actions. The output layer applies softmax, so
# calling the model yields action probabilities, not raw logits.
inputs = tf.keras.layers.Input(shape=(4,))
dense = tf.keras.layers.Dense(16, activation='relu')(inputs)
outputs = tf.keras.layers.Dense(2, activation='softmax')(dense)
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

# Optimizer used for the policy update. `learning_rate` is the supported
# keyword; the legacy `lr` alias is deprecated and rejected by newer Keras
# releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
# NOTE: the training loop in this file computes its own policy-gradient loss
# and never calls `loss_fn`; the name is kept so existing references to it
# keep working.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

# Environment the policy is trained on
env = gym.make('CartPole-v1')

# REINFORCE training loop: play one episode with the current policy, then
# take a single policy-gradient step weighted by the standardized discounted
# returns of that episode.


def _reset_env(environment):
    """Reset *environment* and return only the initial observation.

    Older gym releases return the observation directly, while gym >= 0.26
    returns an ``(observation, info)`` tuple. Both shapes are accepted.
    """
    result = environment.reset()
    if isinstance(result, tuple):
        return result[0]
    return result


def _step_env(environment, action):
    """Advance *environment* one step and return ``(observation, reward, done)``.

    Accepts both the classic 4-tuple step result and the 5-tuple
    ``(observation, reward, terminated, truncated, info)`` of gym >= 0.26.
    """
    result = environment.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _ = result
        return observation, reward, bool(terminated or truncated)
    observation, reward, done, _ = result
    return observation, reward, bool(done)


def _normalized_returns(rewards, gamma):
    """Return the standardized discounted return for each step of an episode.

    The return at step t is ``r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} ...``.
    The result is shifted to zero mean and scaled to (approximately) unit
    standard deviation, and returned as a float32 array so it can be
    multiplied with the float32 model output without implicit dtype casts.
    """
    discounted = []
    running_sum = 0.0
    # Accumulate from the last step backwards so each return is built in O(1).
    for r in reversed(rewards):
        running_sum = r + gamma * running_sum
        discounted.append(running_sum)
    returns = np.array(discounted[::-1], dtype=np.float64)
    # The small constant avoids division by zero when all returns are equal.
    standardized = (returns - returns.mean()) / (returns.std() + 1e-10)
    return standardized.astype(np.float32)


num_episodes = 1000
discount_factor = 0.99
num_actions = int(env.action_space.n)
for i in range(num_episodes):
    # Reset the environment for each episode
    state = _reset_env(env)
    states, actions, rewards = [], [], []
    done = False

    # Run the episode until termination
    while not done:
        # The model's output layer is already a softmax, so its output is the
        # action distribution. Applying softmax a second time would sample
        # from a flatter distribution than the one differentiated below.
        action_probs = model(np.asarray([state], dtype=np.float32)).numpy()[0]
        # Renormalize in float64 so float32 rounding cannot trip the
        # sum-to-one check in np.random.choice.
        action_probs = action_probs.astype(np.float64)
        action_probs /= action_probs.sum()

        # Sample an action from the policy
        action = np.random.choice(num_actions, p=action_probs)

        # Take the chosen action and observe the reward and next state
        next_state, reward, done = _step_env(env, action)

        # Record the transition
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        # Update the state for the next iteration
        state = next_state

    # Standardized discounted returns, one per recorded step
    discounted_rewards = _normalized_returns(rewards, discount_factor)

    # Policy-gradient loss: -mean(log pi(a_t | s_t) * G_t)
    with tf.GradientTape() as tape:
        probs = model(np.asarray(states, dtype=np.float32))
        action_mask = tf.one_hot(actions, depth=num_actions)
        chosen_probs = tf.reduce_sum(probs * action_mask, axis=1)
        # Clip before the log so a probability that underflows to 0 cannot
        # produce -inf / NaN in the loss.
        log_probs = tf.math.log(tf.clip_by_value(chosen_probs, 1e-8, 1.0))
        loss = -tf.reduce_mean(log_probs * discounted_rewards)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Print the episode score
    score = sum(rewards)
    print(f"Episode {i+1}: Score = {score}")

  deprecation(
  deprecation(


Episode 1: Score = 20.0
Episode 2: Score = 23.0
Episode 3: Score = 18.0
Episode 4: Score = 12.0
Episode 5: Score = 12.0
Episode 6: Score = 22.0
Episode 7: Score = 18.0
Episode 8: Score = 12.0
Episode 9: Score = 41.0
Episode 10: Score = 16.0
Episode 11: Score = 19.0
Episode 12: Score = 33.0
Episode 13: Score = 9.0
Episode 14: Score = 33.0
Episode 15: Score = 37.0
Episode 16: Score = 17.0
Episode 17: Score = 30.0
Episode 18: Score = 38.0
Episode 19: Score = 81.0
Episode 20: Score = 13.0
Episode 21: Score = 13.0
Episode 22: Score = 28.0
Episode 23: Score = 25.0
Episode 24: Score = 46.0
Episode 25: Score = 12.0
Episode 26: Score = 65.0
Episode 27: Score = 27.0
Episode 28: Score = 31.0
Episode 29: Score = 27.0
Episode 30: Score = 41.0
Episode 31: Score = 17.0
Episode 32: Score = 9.0
Episode 33: Score = 10.0
Episode 34: Score = 15.0
Episode 35: Score = 11.0
Episode 36: Score = 11.0
Episode 37: Score = 22.0
Episode 38: Score = 19.0
Episode 39: Score = 10.0
Episode 40: Score = 24.0
Episode 41: