### Actor Critic

code from claude


In [44]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

import gym
from env.custom_hopper import *
from tqdm import tqdm

In [45]:
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, learning_rate=0.001):
        super(ActorCritic, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        
        # Actor network
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )
        
        # Critic network
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
    
    def forward(self, state):
        action_probs = self.actor(state)
        state_value = self.critic(state)
        return action_probs, state_value
    
    def get_action(self, state):
        state = torch.FloatTensor(state)
        action_probs, _ = self.forward(state)
        action_probs = action_probs.detach().numpy()
        return np.random.choice(self.action_dim, p=action_probs)
    
    def train(self, state, action, reward, next_state, done):
        state = torch.FloatTensor(state)
        next_state = torch.FloatTensor(next_state)
        reward = torch.FloatTensor([reward])
        done = torch.FloatTensor([done])
        
        # Compute action probabilities and state values
        action_probs, state_value = self.forward(state)
        _, next_state_value = self.forward(next_state)
        
        # Compute advantage
        advantage = reward + (1 - done) * 0.99 * next_state_value - state_value
        
        # Compute losses
        actor_loss = -torch.log(action_probs[action]) * advantage.detach()
        critic_loss = advantage.pow(2)
        
        # Compute total loss
        loss = actor_loss + critic_loss
        
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


In [35]:
env = gym.make('CustomHopper-source-v0')

print('Action space:', env.action_space)
print('State space:', env.observation_space)
print('Dynamics parameters:', env.get_parameters())

state_dim = env.observation_space.shape[-1]
action_dim = env.action_space.shape[-1]

agent = ActorCritic(state_dim, action_dim)

seed_value = 1234
n_train_episodes = 10000

# Training loop (simplified)
for episode in tqdm(range(n_train_episodes), desc='Training'):
    state = env.reset()  # Assuming you have an environment
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state

Action space: Box([-1. -1. -1.], [1. 1. 1.], (3,), float32)
State space: Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf inf inf inf], (11,), float64)
Dynamics parameters: [2.53429174 3.92699082 2.71433605 5.0893801 ]


Training: 100%|██████████| 10000/10000 [1:17:43<00:00,  2.14it/s]


In [39]:
# save the model
torch.save(agent.state_dict(), 'actor_critic.pth')

In [49]:
# load the model and test it
agent2 = ActorCritic(state_dim, action_dim)
agent2.load_state_dict(torch.load('actor_critic.pth'))

reward_list = []

for i in range(50):

    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        action = agent2.get_action(state)
        state, reward, done, _ = env.step(action)
        total_reward += reward

    reward_list.append(total_reward)
    print(f'Episode {i+1}: {total_reward}')

print(f"mean reward: {np.mean(round(reward_list))} std: {np.std(round(reward_list))}")

Episode 1: 105.57162020089108
Episode 2: 288.60376453326154
Episode 3: 139.96577539735327
Episode 4: 198.9992705415749
Episode 5: 149.65983392700852
Episode 6: 151.61105906456245
Episode 7: 127.38692081203864
Episode 8: 132.7609439894023
Episode 9: 129.59071934361077
Episode 10: 141.34431025310812
Episode 11: 131.24788704653
Episode 12: 118.98110658889483
Episode 13: 126.31634474312969
Episode 14: 155.4756898171278
Episode 15: 128.66839334716724
Episode 16: 107.34751197153531
Episode 17: 171.13049661153067
Episode 18: 162.3165250110732
Episode 19: 145.93990685139602
Episode 20: 196.28645252529003
Episode 21: 147.61658372651561
Episode 22: 120.9757093729097
Episode 23: 101.24868898359226
Episode 24: 154.77373471850325
Episode 25: 214.28294850390262
Episode 26: 251.08424553646054
Episode 27: 136.8696882444297
Episode 28: 140.55948115718684
Episode 29: 164.73178156185182
Episode 30: 91.1272921033802
Episode 31: 399.0553155307729
Episode 32: 178.838940081719
Episode 33: 176.57805721519063
