In [7]:
from actorcritic import ActorCriticAgent, EnvironmentWrapper
import gymnasium as gym
import numpy as np

In [8]:
# Hyperparameters and run settings: passed whole to the agent constructor and
# read key-by-key when launching training.
config = dict(
    device='cuda',
    state_size=6,
    action_size=3,
    hidden_sizes=[64, 128, 64],
    lr_actor=0.001,
    lr_critic=0.005,
    verbosity=10,
    env_name='Acrobot-v1',
    gamma=0.99,  # discount factor
    reward_threshold=-85.0,
    max_episodes=4000,
    max_steps=200,
    update_frequency=100,
)

In [9]:
# Child wrapper for Acrobot that adds a height-based shaping term to the reward
class AcrobotWrapper(EnvironmentWrapper):
    """Acrobot wrapper whose step reward is shaped by the height of the free end.

    Relies on ``self.env``, ``self.action_space`` and ``self.target_state_size``;
    these are not set here and are presumably provided by ``EnvironmentWrapper``.
    """

    def step(self, action):
        """Apply ``action`` to the wrapped environment and return a 4-tuple.

        Args:
            action: Action index; clamped into ``[0, self.action_space - 1]``.

        Returns:
            ``(padded_state, reward, done, info)`` where ``padded_state`` is the
            observation zero-padded to ``self.target_state_size`` entries,
            ``reward`` is the environment reward plus the vertical position of
            the free end, and ``done`` is true when the episode was either
            terminated or truncated.
        """
        # Clamp the action index into the valid range.
        action = min(self.action_space - 1, max(0, action))

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # Fold both end-of-episode flags into the single `done` this wrapper
        # exposes, so a time-limit truncation is not silently ignored.
        state, reward, terminated, truncated, info = self.env.step(action)
        done = bool(terminated or truncated)

        # The observation starts with cos/sin of the two joint angles.
        cos_theta1, sin_theta1, cos_theta2, sin_theta2, _, _ = state
        # cos(theta1 + theta2) via the angle-sum identity; avoids recovering the
        # angles with arctan2 only to take their cosine again.
        cos_theta12 = cos_theta1 * cos_theta2 - sin_theta1 * sin_theta2
        # Vertical position of the free end: -cos(theta1) - cos(theta1 + theta2)
        vertical_position = -cos_theta1 - cos_theta12
        # Shaped reward: environment reward plus the height of the free end.
        reward += vertical_position
        # Pad state with zeros to match target_state_size
        padded_state = np.append(state, np.zeros(self.target_state_size - len(state)))
        return padded_state, reward, done, info

In [10]:
# Create the Gymnasium environment named in the config and wrap it so that
# step() goes through AcrobotWrapper.
env = gym.make(config['env_name'])
env_wrapper = AcrobotWrapper(env)

# Build the actor-critic agent from the same config dict
agent = ActorCriticAgent(config)

In [None]:
# Load models into the agent via ActorCriticAgent.load_models() (source location
# is decided by the agent and is not visible in this notebook).
# NOTE(review): this cell has no execution count, so it was not run in the
# recorded session — confirm whether training is meant to start from saved models.
agent.load_models()


In [11]:
# Train on the wrapped environment using the limits defined in `config`.
results = agent.train(
    env_wrapper,
    max_episodes=config['max_episodes'],
    max_steps=config['max_steps'],
    reward_threshold=config['reward_threshold'],
    update_frequency=config['update_frequency'],
)

Episode 0, Avg Reward: -528.5148320794106, PLoss: -312.9441223144531, VLoss: 806.9889526367188
Episode 10, Avg Reward: -575.6100189956752, PLoss: -325.0927734375, VLoss: 872.4464111328125
Episode 20, Avg Reward: -580.8178769889332, PLoss: -282.19329833984375, VLoss: 2387.364990234375
Episode 30, Avg Reward: -583.6162433182039, PLoss: 17.279329299926758, VLoss: 34166.24609375
Episode 40, Avg Reward: -584.8381251256641, PLoss: 41.27375793457031, VLoss: 6710.5556640625
Episode 50, Avg Reward: -585.8060277571865, PLoss: 145.94754028320312, VLoss: 73885.234375
Episode 60, Avg Reward: -586.2191346737205, PLoss: -124.98808288574219, VLoss: 17570.23828125
Episode 70, Avg Reward: -586.6580543895842, PLoss: -116.6540298461914, VLoss: 5765.52978515625
Episode 80, Avg Reward: -586.9050535500785, PLoss: 69.08497619628906, VLoss: 24317.173828125
Episode 90, Avg Reward: -587.1760207630775, PLoss: -84.82068634033203, VLoss: 34337.0625
Episode 100, Avg Reward: -586.9813476938009, PLoss: -117.3304672241

In [13]:
# Redefine the wrapper without the reward shaping: the environment's own reward
# is passed through unchanged. This definition shadows the earlier AcrobotWrapper.
class AcrobotWrapper(EnvironmentWrapper):
    """Acrobot wrapper that only clamps the action and zero-pads the observation.

    Relies on ``self.env``, ``self.action_space`` and ``self.target_state_size``;
    these are not set here and are presumably provided by ``EnvironmentWrapper``.
    """

    def step(self, action):
        """Apply ``action`` to the wrapped environment and return a 4-tuple.

        Args:
            action: Action index; clamped into ``[0, self.action_space - 1]``.

        Returns:
            ``(padded_state, reward, done, info)`` where ``padded_state`` is the
            observation zero-padded to ``self.target_state_size`` entries and
            ``done`` is true when the episode was either terminated or truncated.
        """
        # Clamp the action index into the valid range.
        action = min(self.action_space - 1, max(0, action))

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # Fold both end-of-episode flags into the single `done` this wrapper
        # exposes, so a time-limit truncation is not silently ignored.
        state, reward, terminated, truncated, info = self.env.step(action)
        done = bool(terminated or truncated)

        # Pad state with zeros to match target_state_size
        padded_state = np.append(state, np.zeros(self.target_state_size - len(state)))
        return padded_state, reward, done, info

# Rebind the wrapper around the existing environment so later cells use the
# unshaped reward.
env_wrapper = AcrobotWrapper(env)

In [15]:
# Train the same agent on the re-wrapped environment, capped at 1000 episodes;
# the remaining limits still come from `config`.
results = agent.train(
    env_wrapper,
    max_episodes=1000,
    max_steps=config['max_steps'],
    reward_threshold=config['reward_threshold'],
    update_frequency=config['update_frequency'],
)

Episode 0, Avg Reward: -106.0, PLoss: 0.0003699598310049623, VLoss: 704.8746948242188
Episode 10, Avg Reward: -88.72727272727273, PLoss: -0.23889432847499847, VLoss: 760.2705078125
Episode 20, Avg Reward: -86.0, PLoss: -1.5596301555633545, VLoss: 897.958984375
Episode 30, Avg Reward: -83.51612903225806, PLoss: -0.010168925859034061, VLoss: 930.47021484375
Episode 40, Avg Reward: -86.73170731707317, PLoss: -0.19047808647155762, VLoss: 1525.860595703125
Episode 50, Avg Reward: -86.80392156862744, PLoss: 1.898866891860962, VLoss: 1094.6346435546875
Episode 60, Avg Reward: -87.22950819672131, PLoss: -2.5922634601593018, VLoss: 1032.9727783203125
Episode 70, Avg Reward: -87.29577464788733, PLoss: 0.27880147099494934, VLoss: 1001.565673828125
Episode 80, Avg Reward: -86.46913580246914, PLoss: -5.871864318847656, VLoss: 678.54150390625
Episode 90, Avg Reward: -85.58241758241758, PLoss: 0.023517608642578125, VLoss: 747.7978515625
Solved at episode 99 with average reward -84.86.


In [None]:
# Save the training results under results/<env_name>_<YYYY-MM-DD_HH-MM>.npy
import datetime
import os

# np.save does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)
now = datetime.datetime.now()
# np.save writes NumPy's binary .npy format (and appends ".npy" to any other
# suffix), so name the file accordingly instead of ".csv".
np.save(f'results/{config["env_name"]}_{now.strftime("%Y-%m-%d_%H-%M")}.npy', results)

In [6]:
# Save the agent's models via ActorCriticAgent.save_models() (destination is
# decided by the agent and is not visible in this notebook).
# NOTE(review): this cell's execution count (6) is lower than the import cell's (7),
# so it appears to have been run before the training shown above — re-run it
# after training if the final models should be kept.
agent.save_models()