In [1]:
from actorcritic import ActorCriticAgent, ContinuousActorCriticAgent, EnvironmentWrapper
import gymnasium as gym
import numpy as np



In [2]:

# Single source of settings for the experiment: the whole dict is handed to the
# agent constructor, and individual entries are read by later cells.
config = {
    # --- run identification / hardware ---
    'experiment': 'MountainCarContinuous',
    'device': 'cuda',  # NOTE(review): hard-coded; confirm how the agent behaves without a GPU

    # --- network dimensions ---
    # NOTE(review): presumably these are the padded sizes the wrapper pads
    # states/actions up to, rather than the env's native sizes -- confirm.
    'state_size': 6,
    'action_size': 3,
    'hidden_sizes': [64, 128, 64],

    # --- optimisation ---
    # NOTE(review): 0.1 looks high for an actor-critic learning rate -- confirm it is intended.
    'lr_actor': 0.1,
    'lr_critic': 0.1,
    'verbosity': 10,

    # --- environment / training loop ---
    'env_name': 'MountainCarContinuous-v0',
    'gamma': 0.99,
    'reward_threshold': 80.0,
    'max_episodes': 1000,
    'max_steps': 500,
    'update_frequency': 250,
    'discrete': False,
}

In [18]:
class MCCWrapper(EnvironmentWrapper):
    """MountainCarContinuous wrapper that replaces the env reward with a shaped one.

    The shaped reward is ``velocity * 100`` plus a bonus of 100 whenever the
    position exceeds 0.45. The reward returned by the underlying environment
    is discarded.
    """

    def step(self, action):
        """Advance the wrapped env by one step.

        Args:
            action: Padded action vector; only ``action[0]`` is forwarded to
                the underlying environment.

        Returns:
            A ``(padded_state, reward, done, info)`` tuple where
            ``padded_state`` is the observation zero-padded to
            ``self.target_state_size``, ``reward`` is the shaped reward and
            ``done`` is true when the episode terminated *or* was truncated.
        """
        # Assuming the first element of the padded action vector is the intended action for continuous spaces
        action = action[0]

        # Gymnasium returns (obs, reward, terminated, truncated, info). The env
        # reward is ignored here because it is replaced by the shaped reward below.
        state, _, terminated, truncated, info = self.env.step(action)

        # A truncated episode (e.g. a step limit imposed on the env) has also
        # ended, so it must be reported as done; otherwise the caller keeps
        # stepping an env that needs a reset.
        done = terminated or truncated

        # Pad state to match target_state_size
        padded_state = np.append(state, np.zeros(self.target_state_size - len(state)))

        # Shaped reward: driven by velocity, with a bonus for being past 0.45.
        position = state[0]
        velocity = state[1]

        # Note that negative velocities yield a negative reward.
        reward = velocity * 100

        # NOTE(review): 0.45 is presumably the env's goal position -- confirm.
        if position > 0.45:
            reward += 100

        return padded_state, reward, done, info

    def action_padding(self, action):
        """Return a zero vector of length ``self.target_action_size`` with ``action`` in slot 0."""
        # For continuous actions, it's already a float, pad to match target_action_size
        padded_action = np.zeros(self.target_action_size)
        padded_action[0] = action
        return padded_action

In [19]:
# Build the Gymnasium environment named in the config and wrap it so the agent
# interacts with it through MCCWrapper.
env_id = config['env_name']
env = gym.make(env_id)
env_wrapper = MCCWrapper(env)

# The continuous actor-critic agent is constructed from the full config dict.
agent = ContinuousActorCriticAgent(config)

In [20]:
# Training run with hand-picked limits: these values are passed directly and
# are not read from `config` (which holds different max_steps,
# reward_threshold and update_frequency entries).
results = agent.train(
    env_wrapper,
    max_episodes=1000,
    max_steps=200,
    reward_threshold=200,
    update_frequency=124,
)

Episode 0, Avg Reward: -22.6329141954011, PLoss: -110.26020812988281, VLoss: 33.274932861328125
Episode 10, Avg Reward: -5.037211619973774, PLoss: -24.021568298339844, VLoss: 46.67372512817383
Episode 20, Avg Reward: -5.088498018106894, PLoss: -7.100100994110107, VLoss: 18.78367042541504
Episode 30, Avg Reward: -2.9855371195729443, PLoss: -5.438972473144531, VLoss: 15.050861358642578
Episode 40, Avg Reward: -5.486980249717849, PLoss: -58.036354064941406, VLoss: 117.58099365234375
Episode 50, Avg Reward: -5.5531887732914464, PLoss: -62.6229133605957, VLoss: 32.085693359375
Episode 60, Avg Reward: -4.4076327996115, PLoss: 89.00153350830078, VLoss: 68.31360626220703
Episode 70, Avg Reward: -3.832739070852888, PLoss: 37.867523193359375, VLoss: 40.86731719970703
Episode 80, Avg Reward: -3.4252720820430436, PLoss: 62.441566467285156, VLoss: 37.60742950439453
Episode 90, Avg Reward: -2.972304682757764, PLoss: 38.61164474487305, VLoss: 6.947000503540039
Episode 100, Avg Reward: -1.747556559440

In [21]:
class MCCWrapper(EnvironmentWrapper):
    """MountainCarContinuous wrapper that passes the env reward through unchanged.

    Only the observation is altered (zero-padded to ``self.target_state_size``).
    Defining this class rebinds the name ``MCCWrapper``, so wrappers created
    afterwards use this version.
    """

    def step(self, action):
        """Advance the wrapped env by one step.

        Args:
            action: Padded action vector; only ``action[0]`` is forwarded to
                the underlying environment.

        Returns:
            A ``(padded_state, reward, done, info)`` tuple where
            ``padded_state`` is the observation zero-padded to
            ``self.target_state_size``, ``reward`` is the env's own reward and
            ``done`` is true when the episode terminated *or* was truncated.
        """
        # Assuming the first element of the padded action vector is the intended action for continuous spaces
        action = action[0]

        # Gymnasium returns (obs, reward, terminated, truncated, info).
        state, reward, terminated, truncated, info = self.env.step(action)

        # A truncated episode (e.g. a step limit imposed on the env) has also
        # ended, so it must be reported as done; otherwise the caller keeps
        # stepping an env that needs a reset.
        done = terminated or truncated

        # Pad state to match target_state_size
        padded_state = np.append(state, np.zeros(self.target_state_size - len(state)))

        return padded_state, reward, done, info

    def action_padding(self, action):
        """Return a zero vector of length ``self.target_action_size`` with ``action`` in slot 0."""
        # For continuous actions, it's already a float, pad to match target_action_size
        padded_action = np.zeros(self.target_action_size)
        padded_action[0] = action
        return padded_action

In [22]:
# Re-wrap the existing env with the current MCCWrapper definition. The `agent`
# object is reused (not re-created), and every training limit now comes from
# `config`.
env_wrapper = MCCWrapper(env)
train_settings = {
    key: config[key]
    for key in ('max_episodes', 'max_steps', 'reward_threshold', 'update_frequency')
}
results = agent.train(env_wrapper, **train_settings)

Episode 0, Avg Reward: -52.755937436186905, PLoss: -85.64596557617188, VLoss: 6.484879016876221
Episode 10, Avg Reward: -51.03031835682229, PLoss: -82.35324096679688, VLoss: 5.460649013519287
Episode 20, Avg Reward: -50.628964224693036, PLoss: -92.96109008789062, VLoss: 6.803727626800537
Episode 30, Avg Reward: -50.16470561445307, PLoss: -80.71420288085938, VLoss: 5.9874653816223145
Episode 40, Avg Reward: -50.24910531623818, PLoss: -52.20458984375, VLoss: 4.2186174392700195
Episode 50, Avg Reward: -50.3915127874746, PLoss: -65.89071655273438, VLoss: 5.160287857055664
Episode 60, Avg Reward: -49.99458309489909, PLoss: -34.0566291809082, VLoss: 4.391839981079102
Episode 70, Avg Reward: -50.03684971205408, PLoss: -81.07273864746094, VLoss: 9.297908782958984
Episode 80, Avg Reward: -50.153439642885985, PLoss: -53.1116828918457, VLoss: 6.417478561401367
Episode 90, Avg Reward: -50.13915479508059, PLoss: -24.704288482666016, VLoss: 4.856125354766846
Episode 100, Avg Reward: -50.037648243155

In [7]:
# Persist the agent's models via its own save_models() method.
# NOTE(review): the output location/format is decided inside the agent
# implementation and is not visible in this notebook -- confirm before relying on it.
agent.save_models()

In [8]:
# Save the training results under a per-environment, timestamped file name.
import datetime
from pathlib import Path

import numpy as np

# np.save does not create missing directories, so make sure the target exists.
results_dir = Path('results')
results_dir.mkdir(parents=True, exist_ok=True)

now = datetime.datetime.now()

# np.save writes NumPy's binary .npy format and appends '.npy' to any name that
# lacks it, so a '.csv' name would end up as '<name>.csv.npy'. Use the real
# extension so the file name matches its contents (load it with np.load).
results_path = results_dir / f'{config["env_name"]}_{now.strftime("%Y-%m-%d_%H-%M")}.npy'
np.save(results_path, results)