In [1]:
from pathlib import Path

import gymnasium as gym
import numpy as np

from actorcritic import ActorCriticAgent, EnvironmentWrapper



In [2]:

# Single source of truth for this experiment. The dict is passed whole to
# ActorCriticAgent, and several entries are also forwarded to agent.train().
config = {
    # --- bookkeeping -------------------------------------------------------
    'experiment': 'MountainCarContinuous',  # used to name the saved results file
    'device': 'cuda',  # NOTE(review): hard-coded; presumably needs a CUDA-capable GPU
    # --- network dimensions ------------------------------------------------
    # NOTE(review): these are larger than the raw environment needs; the
    # wrapper zero-pads observations, presumably up to state_size -- confirm.
    'state_size': 6,
    'action_size': 3,
    'hidden_sizes': [64, 64],
    # --- optimisation ------------------------------------------------------
    'lr_actor': 0.001,
    'lr_critic': 0.0005,
    'verbosity': 10,  # presumably the logging interval in episodes -- confirm
    # --- environment / training loop ---------------------------------------
    'env_name': 'MountainCarContinuous-v0',  # id handed to gym.make
    'gamma': 0.99,
    'reward_threshold': 80.0,
    'max_episodes': 5000,
    'max_steps': 500,
    'update_frequency': 500,
    'discrete': True,
}

In [3]:
class MCCWrapper(EnvironmentWrapper):
    """Drive a continuous-action environment with discrete action indices.

    Each discrete index is mapped to one of ``num_actions`` evenly spaced
    values in ``[-1, 1]``, which is then passed to the wrapped environment as
    a one-element continuous action. Observations are zero-padded to
    ``self.target_state_size`` before being returned.

    ``self.env`` and ``self.target_state_size`` are not set here; they are
    presumably provided by ``EnvironmentWrapper.__init__`` -- confirm.

    Args:
        env: The environment to wrap (its ``step`` must follow the Gymnasium
            five-value return convention).
        num_actions: Number of discrete actions exposed to the agent.
    """

    # Number of wrapper steps (counted across ALL episodes, the counter is
    # never reset) during which the shaped reward replaces the env reward.
    SHAPING_STEPS = 1000
    # Scale applied to the position * velocity shaping term.
    SHAPING_SCALE = 0.1
    # Position at or beyond which the shaping bonus is granted, and its size.
    GOAL_POSITION = 0.45
    GOAL_BONUS = 100

    def __init__(self, env, num_actions=5):
        super().__init__(env)
        # NOTE: this stores a plain int, not a gym Space object.
        self.action_space = num_actions
        # Define the discrete action boundaries
        self.action_boundaries = np.linspace(-1, 1, num_actions)
        # Global step counter used to switch reward shaping off.
        self.ticker = 0

    def step(self, action):
        """Apply a discrete action and return ``(state, reward, done, info)``.

        Args:
            action: Discrete action index; out-of-range values are clamped.

        Returns:
            A 4-tuple of the zero-padded observation, the (possibly shaped)
            reward, a flag that is true when the episode terminated *or* was
            truncated, and the info dict from the wrapped environment.
        """
        # Convert the discrete action into a continuous action
        continuous_action = [self.discretize_action(action)]

        # Gymnasium's step returns (obs, reward, terminated, truncated, info).
        # Both end-of-episode signals are folded into `done`, so a time-limit
        # truncation is no longer silently dropped.
        state, reward, terminated, truncated, info = self.env.step(continuous_action)
        done = terminated or truncated

        if self.ticker < self.SHAPING_STEPS:
            # Shaped reward: positive when state[0] and state[1] share a sign.
            # The original author's intent was "moving up-hill is good".
            # NOTE(review): state[0]/state[1] are presumably position and
            # velocity; since the valley floor is not at position 0, this
            # product does not track uphill motion exactly -- confirm intent.
            reward = state[0] * state[1] * self.SHAPING_SCALE

            if state[0] >= self.GOAL_POSITION:
                reward += self.GOAL_BONUS

        # Pad state to match target_state_size
        padding = np.zeros(self.target_state_size - len(state))
        padded_state = np.append(state, padding)

        self.ticker += 1

        return padded_state, reward, done, info

    def discretize_action(self, action):
        """Map a discrete action index to its continuous action value.

        Despite the name, this converts discrete -> continuous. Indices
        outside ``[0, self.action_space - 1]`` are clamped to the nearest
        valid index rather than raising.
        """
        # Ensure the action is within the valid range
        action = max(0, min(action, self.action_space - 1))
        return self.action_boundaries[action]

In [5]:
# Initialize the environment
env = gym.make(config['env_name'])
# Wrap it so actions are chosen from 2 discrete indices, which MCCWrapper maps
# onto evenly spaced continuous values in [-1, 1] (here: -1 and 1).
# NOTE(review): config['action_size'] is 3 but num_actions is 2. MCCWrapper
# clamps out-of-range indices, so if the agent emits index 2 it acts like
# index 1 -- confirm this asymmetry is intended.
env_wrapper = MCCWrapper(env, num_actions=2)


In [6]:
# Initialize the ActorCriticAgent
# The whole config dict is handed to the constructor; which keys it actually
# reads is decided inside the actorcritic module (not visible here).
agent = ActorCriticAgent(config)

In [7]:
# Run training with the limits taken from `config`.
# `results` is only bound once train() returns: interrupting this cell
# (e.g. KeyboardInterrupt) leaves it undefined for the cells below.
train_kwargs = {
    key: config[key]
    for key in ('max_episodes', 'max_steps', 'reward_threshold', 'update_frequency')
}
results = agent.train(env_wrapper, **train_kwargs)

Episode 0, Avg Reward: 0.005652161398234069, PLoss: 0.051719777286052704, VLoss: 0.003804302541539073
Episode 10, Avg Reward: -29.589759890698904, PLoss: -54.414520263671875, VLoss: 4.941513538360596
Episode 20, Avg Reward: -29.11844565703296, PLoss: 60.2781982421875, VLoss: 10005.21875
Episode 30, Avg Reward: -32.05120512250634, PLoss: -53.95663833618164, VLoss: 4.829858303070068
Episode 40, Avg Reward: -36.428959970675635, PLoss: -53.58173751831055, VLoss: 4.768331050872803
Episode 50, Avg Reward: -39.089948211719715, PLoss: -53.16948699951172, VLoss: 4.700223922729492
Episode 60, Avg Reward: -39.17684194750343, PLoss: -52.72344207763672, VLoss: 4.6161065101623535
Episode 70, Avg Reward: -40.701230405601606, PLoss: -52.12405014038086, VLoss: 4.530447483062744
Episode 80, Avg Reward: -40.556634059231094, PLoss: -51.46949005126953, VLoss: 4.459988594055176
Episode 90, Avg Reward: -41.59436658019476, PLoss: -50.844058990478516, VLoss: 4.277740478515625
Episode 100, Avg Reward: -41.73093

KeyboardInterrupt: 

In [8]:
# Save the models
# NOTE(review): save_models() is called without arguments, so the destination
# is chosen inside ActorCriticAgent -- confirm where the files end up.
agent.save_models()

In [9]:
# Plot the per-episode reward next to the 'Average_100' series
# (presumably a 100-episode running average -- confirm in the agent code).
# Requires `results` from a training run that finished without interruption.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
for column, legend_name in (('Reward', 'Reward'), ('Average_100', 'Average 100')):
    ax.plot(results['Episode'], results[column], label=legend_name)
ax.set_xlabel('Episodes')
ax.set_ylabel('Reward')
ax.legend()
plt.show()


NameError: name 'results' is not defined

In [None]:
# Persist the training results to results/<experiment>.npy (np.save appends
# the .npy suffix). The directory is created first so a fresh checkout does
# not fail with FileNotFoundError.
# NOTE(review): if `results` is a dict-like object, np.save pickles it and
# reading it back needs np.load(..., allow_pickle=True) -- confirm the type.
results_dir = Path('results')
results_dir.mkdir(parents=True, exist_ok=True)
np.save(results_dir / config["experiment"], results)