In [21]:
import numpy as np
import gym
import torch
import torch.nn as nn
import torch.optim as optim

# Define the Actor-Critic network
class ActorCritic(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(ActorCritic, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.policy_head = nn.Linear(128, output_dim)
        self.value_head = nn.Linear(128, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        policy = torch.softmax(self.policy_head(x), dim=-1)
        value = self.value_head(x)
        return policy, value

# Hyperparameters
learning_rate = 0.001
num_episodes = 1000
gamma = 0.99  # Discount factor

# Initialize environment and model
env = gym.make('CartPole-v1')
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
model = ActorCritic(input_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Function to select action based on policy
def select_action(state):
    state = torch.FloatTensor(state).unsqueeze(0)  # Add batch dimension
    policy, _ = model(state)
    action = np.random.choice(output_dim, p=policy.detach().numpy()[0])  # Use the first (and only) batch
    return action

# Training loop
for episode in range(num_episodes):
    state = env.reset()[0]
    # Debugging: Print initial state
    print(f"Initial state: {state}")

    done = False
    episode_reward = 0

    while not done:
        action = select_action(state)
        next_state, reward, done, _ = env.step(action)

        # Debugging: Print state transition
        print(f"State: {state}, Action: {action}, Reward: {reward}, Next State: {next_state}")

        # Convert to tensors
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
        reward_tensor = torch.FloatTensor([reward])

        # Get value estimates
        _, value = model(state_tensor)
        _, next_value = model(next_state_tensor)

        # Calculate advantage
        advantage = reward_tensor + (1 - done) * gamma * next_value - value

        # Update policy (actor)
        policy, _ = model(state_tensor)
        policy_loss = -torch.log(policy[0, action]) * advantage.detach()  # Use first batch item
        
        # Update value function (critic)
        value_loss = advantage.pow(2)

        # Total loss
        optimizer.zero_grad()
        (policy_loss + value_loss).backward()
        optimizer.step()

        state = next_state
        episode_reward += reward

    if episode % 100 == 0:
        print(f'Episode {episode}, Reward: {episode_reward}')

env.close()


Initial state: [ 0.0339111   0.00154158 -0.04285301  0.01215115]


ValueError: too many values to unpack (expected 4)

In [17]:
state


(array([ 0.03727205,  0.00309066, -0.02891434, -0.03104826], dtype=float32),
 {})