In [1]:
from actorcritic import ActorCriticAgent, EnvironmentWrapper
import gymnasium as gym
import numpy as np



In [2]:

config = {
    'device': 'cuda',
    'state_size': 6, 
    'action_size': 3,
    'hidden_sizes': [64, 128, 128, 64], 
    'lr_actor': 0.001,
    'lr_critic': 0.005,
    'verbosity': 10,
    'env_name': 'MountainCarContinuous-v0',
    'gamma': 0.99, 
    'reward_threshold': 90.0,
    'max_episodes': 1000,
    'max_steps': 500,
    'update_frequency': 250

}

In [3]:
class MCCWrapper(EnvironmentWrapper):
    def __init__(self, env, target_state_size=6, target_action_size=3, optimize_action=True, encourage=False):
        super().__init__(env, target_state_size, target_action_size)
        self.optimize_action = optimize_action
        self.encourage = encourage
        self.last_action = 0

    def step(self, action):
        # Assuming the first element of the padded action vector is the intended action for continuous spaces
        action = np.array([action])
        state, reward, done, _, info = self.env.step(action)
        # Pad state to match target_state_size
        padded_state = np.append(state, np.zeros(self.target_state_size - len(state)))

        if self.encourage:      
            position = state[0]
            velocity = state[1]
            if position > 0.05:
                reward += 50 # Encourage both position and velocity

                if self.last_action * action[0] > 0:
                    reward += 0.1

        if not self.optimize_action:
            # Remove action penalty during initial episodes
            action_penalty = 0.01 * action[0]**2
            reward += action_penalty  # Add back the subtracted penalty to neutralize it

        return padded_state, reward, done, info

    def action_padding(self, action):
        # For continuous actions, it's already a float, pad to match target_action_size

        padded_action = np.zeros(self.target_action_size)
        padded_action[0] = action
        return padded_action

In [None]:
# Initialize the environment
env = gym.make(config['env_name'])
env_wrapper = MCCWrapper(env, optimize_action=False, encourage=True)

# Initialize the ActorCriticAgent
agent = ActorCriticAgent(config)

In [5]:
results = agent.train(env_wrapper, max_episodes=1000, max_steps=200, reward_threshold=200, update_frequency=124)

Episode 0, Avg Reward: -25.469999999999974, PLoss: -18.23121452331543, VLoss: 4.76250696182251
Episode 10, Avg Reward: -27.34363636363633, PLoss: -14.18409252166748, VLoss: 3.2114710807800293
Episode 20, Avg Reward: -23.575714285714266, PLoss: -5.386652946472168, VLoss: 1.8356647491455078
Episode 30, Avg Reward: -18.34838709677418, PLoss: -3.932079553604126, VLoss: 0.24220699071884155
Episode 40, Avg Reward: -14.015853658536574, PLoss: -0.4094262719154358, VLoss: 0.011798807419836521
Episode 50, Avg Reward: -11.272941176470578, PLoss: 0.00010030555131379515, VLoss: 0.0003406023024581373
Episode 60, Avg Reward: -9.424918032786877, PLoss: 5.211324605625123e-06, VLoss: 1.3916159332438838e-05
Episode 70, Avg Reward: -8.097464788732387, PLoss: -1.2873040304839378e-06, VLoss: 2.8735000796586974e-06
Episode 80, Avg Reward: -7.097777777777772, PLoss: 5.855249582964461e-07, VLoss: 9.845788326856564e-07
Episode 90, Avg Reward: -6.317802197802193, PLoss: -2.3283814698515926e-07, VLoss: 1.96858579

In [None]:
env_wrapper = MCCWrapper(env, optimize_action=False, encourage=False)
results = agent.train(env_wrapper, max_episodes=1000, max_steps=200, reward_threshold=200, update_frequency=124)

Episode 0, Avg Reward: 0.0, PLoss: -8.791076200331815e-10, VLoss: 3.3721200054515066e-08
Episode 10, Avg Reward: 0.0, PLoss: -3.956101224389386e-09, VLoss: 6.689766962608701e-08


KeyboardInterrupt: 

In [None]:
env_wrapper = MCCWrapper(env, optimize_action=False, encourage=False)
results = agent.train(env_wrapper, max_episodes=config['max_episodes'], max_steps=config['max_steps'], reward_threshold=config['reward_threshold'], update_frequency=config['update_frequency'])

Episode 0, Avg Reward: 0.0, PLoss: -0.00014008763537276536, VLoss: 39.984580993652344
Episode 10, Avg Reward: 0.0, PLoss: -1.1904383256933215e-07, VLoss: 3.04175446217414e-05


KeyboardInterrupt: 

In [None]:
# Save the models
agent.save_models()

In [None]:
# save results with model name and time
import datetime
import numpy as np
now = datetime.datetime.now()
np.save(f'results/{config["env_name"]}_{now.strftime("%Y-%m-%d_%H-%M")}.csv', results)