In [1]:
import numpy as np
import gym
import torch
import torch.nn as nn
import torch.optim as optim

class PolicyNetwork(nn.Module):
    """Small feed-forward policy that maps a state to action probabilities.

    The network is a single hidden layer of 128 ReLU units followed by a
    linear output layer; the output is normalised with a softmax over the
    last dimension so that it sums to one along that axis.
    """

    def __init__(self, state_dim, action_dim):
        """Create the two linear layers.

        Args:
            state_dim: Number of input features fed to the first layer.
            action_dim: Number of outputs (one probability per action).
        """
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, action_dim)

    def forward(self, x):
        """Return the softmax-normalised output of the network for ``x``."""
        hidden = self.fc1(x).relu()
        logits = self.fc2(hidden)
        return logits.softmax(dim=-1)

def compute_discounted_rewards(rewards, gamma):
    """Return the discounted return-to-go for each step of an episode.

    Element ``t`` of the result equals
    ``rewards[t] + gamma * rewards[t + 1] + gamma**2 * rewards[t + 2] + ...``.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor applied once per step into the future.

    Returns:
        A new list with the same length as ``rewards``; an empty input
        yields an empty list.
    """
    returns_reversed = []
    running_return = 0
    # Walk backwards so each return reuses the one computed for the next step.
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        returns_reversed.append(running_return)
    # Appending and reversing once keeps this linear; inserting at the front
    # of a list on every iteration would be quadratic in episode length.
    returns_reversed.reverse()
    return returns_reversed

def _reset_env(env):
    """Reset ``env`` and return only the initial observation."""
    outcome = env.reset()
    # Newer Gym releases return ``(observation, info)``; older ones return
    # the bare observation.
    if isinstance(outcome, tuple) and len(outcome) == 2 and isinstance(outcome[1], dict):
        return outcome[0]
    return outcome


def _step_env(env, action):
    """Advance ``env`` by one action and return ``(next_state, reward, done)``."""
    outcome = env.step(action)
    if len(outcome) == 5:
        # Newer step API: the episode is over if it terminated or was truncated.
        next_state, reward, terminated, truncated, _ = outcome
        return next_state, reward, bool(terminated or truncated)
    next_state, reward, done, _ = outcome
    return next_state, reward, done


def main():
    """Train a policy-gradient (REINFORCE) agent on CartPole-v1.

    For up to 1000 episodes: roll out one episode by sampling actions from
    the policy network, compute normalised discounted returns, take one Adam
    step on ``-sum(log_prob * return)``, and print the episode's total
    reward. Training stops early once a single episode reaches a total
    reward of at least 200. The environment is closed even if training
    raises.
    """
    env = gym.make("CartPole-v1")
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        policy_net = PolicyNetwork(state_dim, action_dim)
        optimizer = optim.Adam(policy_net.parameters(), lr=0.01)

        num_episodes = 1000
        gamma = 0.99

        for episode in range(num_episodes):
            state = _reset_env(env)
            log_probs, rewards = [], []

            done = False
            while not done:
                state = torch.tensor(state, dtype=torch.float32)
                probs = policy_net(state)
                action = torch.multinomial(probs, 1).item()

                # Record the log-probability now (with its autograd graph)
                # so the update needs no second forward pass per step.
                log_probs.append(torch.log(probs[action]))

                state, reward, done = _step_env(env, action)
                rewards.append(reward)

            returns = torch.tensor(
                compute_discounted_rewards(rewards, gamma), dtype=torch.float32
            )
            # The sample standard deviation of a single value is NaN, which
            # would corrupt every parameter; only normalise longer episodes.
            if returns.numel() > 1:
                returns = (returns - returns.mean()) / (returns.std() + 1e-8)

            optimizer.zero_grad()
            loss = -(torch.stack(log_probs) * returns).sum()
            loss.backward()
            optimizer.step()

            total_reward = sum(rewards)
            print(f"Episode {episode + 1}: Total Reward = {total_reward}")

            # NOTE(review): one episode at >= 200 is a loose stopping rule for
            # CartPole-v1 — confirm the intended threshold / averaging window.
            if total_reward >= 200:
                print("Solved the environment!")
                break
    finally:
        env.close()

# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 1: Total Reward = 14.0
Episode 2: Total Reward = 28.0
Episode 3: Total Reward = 40.0
Episode 4: Total Reward = 31.0
Episode 5: Total Reward = 104.0
Episode 6: Total Reward = 17.0
Episode 7: Total Reward = 31.0
Episode 8: Total Reward = 30.0
Episode 9: Total Reward = 55.0
Episode 10: Total Reward = 18.0
Episode 11: Total Reward = 76.0
Episode 12: Total Reward = 29.0
Episode 13: Total Reward = 32.0
Episode 14: Total Reward = 128.0
Episode 15: Total Reward = 133.0
Episode 16: Total Reward = 82.0
Episode 17: Total Reward = 84.0
Episode 18: Total Reward = 72.0
Episode 19: Total Reward = 26.0
Episode 20: Total Reward = 42.0
Episode 21: Total Reward = 78.0
Episode 22: Total Reward = 98.0
Episode 23: Total Reward = 143.0
Episode 24: Total Reward = 121.0
Episode 25: Total Reward = 66.0
Episode 26: Total Reward = 97.0
Episode 27: Total Reward = 85.0
Episode 28: Total Reward = 91.0
Episode 29: Total Reward = 74.0
Episode 30: Total Reward = 55.0
Episode 31: Total Reward = 157.0
Episode 32: