In [86]:
from actorcritic import ActorCriticAgent, ContinuousActorCriticAgent, EnvironmentWrapper
import gymnasium as gym
import numpy as np



In [92]:

config = {
    'experiment': 'MountainCarContinuous',
    'device': 'cuda',
    'state_size': 6, 
    'action_size': 3,
    'hidden_sizes': [64, 64], 
    'lr_actor': 0.001,
    'lr_critic': 0.0005,
    'verbosity': 10,
    'env_name': 'MountainCarContinuous-v0',
    'gamma': 0.99, 
    'reward_threshold': 80.0,
    'max_episodes': 10000,
    'max_steps': 500,
    'update_frequency': 500,
    'discrete': True

}

In [97]:
class MCCWrapper(EnvironmentWrapper):
    def __init__(self, env, num_actions=5):
        super().__init__(env)
        self.action_space = num_actions
        # Define the discrete action boundaries
        self.action_boundaries = np.linspace(-1, 1, num_actions)
        self.ticker = 100000


    def step(self, action):
        # Convert the discrete action into a continuous action
        continuous_action = [self.discretize_action(action)]


        # Step using the continuous action
        state, reward, done, _, info = self.env.step(continuous_action)

        reward = 0
        if state[0] >= 0.45:
            print('success')
            reward += 1

        # Pad state to match target_state_size
        padded_state = np.append(state, np.zeros(self.target_state_size - len(state)))
        return padded_state, reward, done, info

    def discretize_action(self, action):
        # Ensure the action is within the valid range
        action = max(0, min(action, self.action_space - 1))
        return self.action_boundaries[action]

In [98]:
# Initialize the environment
env = gym.make(config['env_name'])
env_wrapper = MCCWrapper(env, num_actions=2)


In [99]:
# Initialize the ActorCriticAgent
agent = ActorCriticAgent(config)

In [100]:
results = agent.train(env_wrapper, max_episodes=config['max_episodes'], max_steps=config['max_steps'], reward_threshold=config['reward_threshold'], update_frequency=config['update_frequency'])

Episode 0, Avg Reward: 0.0, PLoss: -0.02809300273656845, VLoss: 2.346990368096158e-05
success
success
Episode 10, Avg Reward: 0.18181818181818182, PLoss: -0.13501030206680298, VLoss: 4.6662262320751324e-05
success
Episode 20, Avg Reward: 0.14285714285714285, PLoss: 0.006202287040650845, VLoss: 0.0008403310785070062
success
success
Episode 30, Avg Reward: 0.16129032258064516, PLoss: 0.8085190653800964, VLoss: 0.7923123240470886
Episode 40, Avg Reward: 0.12195121951219512, PLoss: -0.0646931454539299, VLoss: 0.0007449614349752665
success
success
Episode 50, Avg Reward: 0.13725490196078433, PLoss: -0.03795450180768967, VLoss: 0.001049244194291532
success
success
success
Episode 60, Avg Reward: 0.16393442622950818, PLoss: 0.8319472670555115, VLoss: 0.6701138019561768
Episode 70, Avg Reward: 0.14084507042253522, PLoss: -0.2892712652683258, VLoss: 0.002572599332779646
success
Episode 80, Avg Reward: 0.13580246913580246, PLoss: -0.010653910227119923, VLoss: 0.0015055539552122355
Episode 90, Av

In [None]:
# Initialize the environment
env = gym.make(config['env_name'])
env_wrapper = MCCWrapper(env, num_actions=config['action_size'])

In [None]:
results = agent.train(env_wrapper, max_episodes=config['max_episodes'], max_steps=config['max_steps'], reward_threshold=config['reward_threshold'], update_frequency=config['update_frequency'])

In [7]:
# Save the models
agent.save_models()

In [None]:
# plot results
import matplotlib.pyplot as plt

plt.plot(results['Episode'], results['Reward'], label='Reward')
plt.plot(results['Episode'], results['Average_100'], label='Average 100')
plt.xlabel('Episodes')
plt.ylabel('Reward')
plt.legend()
plt.show()


In [8]:
np.save(f'results/{config["experiment"]}', results)