In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym

# Define Actor Network
class Actor(nn.Module):
    """Policy network mapping an input vector to a probability distribution.

    Architecture: two 64-unit ReLU hidden layers followed by a linear layer
    of width ``output_dim`` whose outputs are normalised with a softmax over
    the last dimension.

    Args:
        input_dim: Number of features in each input vector.
        output_dim: Number of entries in the output distribution.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 64
        # Layers are created in the order they are applied in forward().
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return probabilities over the last dimension for input ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        logits = self.fc3(hidden)
        return self.softmax(logits)

# Define Critic Network
class Critic(nn.Module):
    """Value network mapping an input vector to a single scalar estimate.

    Architecture: two 64-unit ReLU hidden layers followed by a linear layer
    with one output, so a batch of N inputs produces an output of shape
    ``(N, 1)``.

    Args:
        input_dim: Number of features in each input vector.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_units = 64
        # Layers are created in the order they are applied in forward().
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Return the (unbounded) scalar estimate for input ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# Discounted rewards function
def discount_rewards(rewards, gamma=0.99):
    """Compute the discounted return-to-go for every step of an episode.

    For step ``t`` the result is
    ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor applied per step.

    Returns:
        A list with the same length as ``rewards`` holding the discounted
        return from each step onward. An empty input yields an empty list.
    """
    discounted_rewards = []
    running_add = 0
    # Walk backwards so each return reuses the one computed for the next step.
    for r in reversed(rewards):
        running_add = running_add * gamma + r
        # Append and reverse once at the end: inserting at index 0 on every
        # iteration would make the whole loop quadratic in episode length.
        discounted_rewards.append(running_add)
    discounted_rewards.reverse()
    return discounted_rewards

# Function to compute advantages
def compute_advantages(critic, states, rewards):
    """Return per-step advantages ``rewards - critic(states)``.

    Args:
        critic: Callable mapping a batch of states to one value per state,
            e.g. a network whose output has shape ``(N, 1)``.
        states: Batch of states passed to ``critic`` unchanged.
        rewards: Target returns, one per state (tensor or sequence of
            numbers), with shape ``(N,)`` or ``(N, 1)``.

    Returns:
        A tensor with the same shape as ``rewards``. It is computed without
        gradient tracking, so a policy loss weighted by it does not
        back-propagate into the critic.

    Raises:
        RuntimeError: If the critic does not produce exactly one value per
            entry of ``rewards``.
    """
    # The advantage is only a weight for the policy gradient; the critic is
    # trained by its own loss, so no graph is needed here.
    with torch.no_grad():
        values = critic(states)
    rewards = torch.as_tensor(rewards, dtype=values.dtype, device=values.device)
    # Align the value shape with the rewards before subtracting. Subtracting
    # an (N, 1) tensor from an (N,) tensor would silently broadcast to an
    # (N, N) matrix instead of N element-wise differences.
    advantages = rewards - values.reshape(rewards.shape)
    return advantages

# Initialize the environment
env = gym.make('CartPole-v1')
# Network sizes come straight from the environment's spaces: the length of
# the observation vector and the number of discrete actions.
input_dim, output_dim = env.observation_space.shape[0], env.action_space.n

# Build the policy (actor) and value (critic) networks, actor first
actor = Actor(input_dim, output_dim)
critic = Critic(input_dim)

# Each network gets its own Adam optimizer with the same learning rate
optimizer_actor = optim.Adam(params=actor.parameters(), lr=0.01)
optimizer_critic = optim.Adam(params=critic.parameters(), lr=0.01)

# Training loop
NUM_EPISODES = 1000
RENDER = True  # drawing every frame slows training a lot; set False to run headless

try:
    for episode in range(NUM_EPISODES):
        # Older gym returns just the observation from reset(); gym >= 0.26
        # returns an (observation, info) tuple. Accept both.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        episode_reward = 0

        states = []
        actions = []
        rewards = []

        # ---- Collect one full episode with the current policy ----
        done = False
        while not done:
            if RENDER:
                env.render()  # Render the environment

            state_tensor = torch.tensor(state, dtype=torch.float32)
            states.append(state_tensor)

            # Sampling needs no graph; the log-probabilities are recomputed
            # with gradients for the whole episode at update time.
            with torch.no_grad():
                action_probs = actor(state_tensor)
            action = torch.multinomial(action_probs, num_samples=1).item()
            actions.append(action)

            # Older gym: (obs, reward, done, info).
            # gym >= 0.26: (obs, reward, terminated, truncated, info).
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            rewards.append(reward)

            episode_reward += reward
            state = next_state

        # ---- One actor/critic update from the finished episode ----
        state_batch = torch.stack(states)
        action_batch = torch.tensor(actions, dtype=torch.int64)
        discounted_rewards = torch.tensor(discount_rewards(rewards), dtype=torch.float32)

        # The critic outputs shape (N, 1), so pass the returns as a column to
        # get element-wise differences rather than an (N, N) broadcast, then
        # flatten back to (N,). Detach so the actor loss cannot push
        # gradients into the critic.
        advantages = compute_advantages(
            critic, state_batch, discounted_rewards.unsqueeze(1)
        ).squeeze(1).detach()

        # Compute actor loss: log-probability of each action actually taken,
        # weighted by its advantage
        taken_probs = actor(state_batch).gather(1, action_batch.unsqueeze(1)).squeeze(1)
        log_probs = torch.log(taken_probs)
        actor_loss = -(log_probs * advantages).mean()

        # Compute critic loss: mean squared error against the discounted
        # returns. squeeze(-1) keeps shape (N,) even for a one-step episode.
        values = critic(state_batch).squeeze(-1)
        critic_loss = ((values - discounted_rewards) ** 2).mean()

        # Update actor and critic
        optimizer_actor.zero_grad()
        optimizer_critic.zero_grad()
        actor_loss.backward()
        critic_loss.backward()
        optimizer_actor.step()
        optimizer_critic.step()

        print("Episode: {}, Reward: {}".format(episode, episode_reward))
finally:
    # Release the environment (and any render window) even when training is
    # interrupted, e.g. by stopping the cell.
    env.close()


  import distutils.spawn


Episode: 0, Reward: 15.0
Episode: 1, Reward: 18.0
Episode: 2, Reward: 13.0
Episode: 3, Reward: 11.0
Episode: 4, Reward: 11.0
Episode: 5, Reward: 14.0
Episode: 6, Reward: 11.0
Episode: 7, Reward: 18.0
Episode: 8, Reward: 12.0
Episode: 9, Reward: 11.0
Episode: 10, Reward: 15.0
Episode: 11, Reward: 15.0
Episode: 12, Reward: 13.0
Episode: 13, Reward: 14.0
Episode: 14, Reward: 23.0
Episode: 15, Reward: 22.0
Episode: 16, Reward: 15.0
Episode: 17, Reward: 19.0
Episode: 18, Reward: 19.0
Episode: 19, Reward: 14.0
Episode: 20, Reward: 14.0
Episode: 21, Reward: 27.0
Episode: 22, Reward: 25.0
Episode: 23, Reward: 13.0
Episode: 24, Reward: 21.0
Episode: 25, Reward: 52.0
Episode: 26, Reward: 31.0
Episode: 27, Reward: 39.0
Episode: 28, Reward: 13.0
Episode: 29, Reward: 43.0
Episode: 30, Reward: 48.0
Episode: 31, Reward: 19.0
Episode: 32, Reward: 33.0
Episode: 33, Reward: 44.0
Episode: 34, Reward: 44.0
Episode: 35, Reward: 56.0
Episode: 36, Reward: 74.0


KeyboardInterrupt: 

: 