In [None]:
"""
TODO:
- Test updating the policy every step
- Try a smaller value network with a bigger policy network
- Try a bigger value network with a smaller policy network
- Tune the learning rates
"""

In [4]:
from pathlib import Path

import gymnasium as gym
import numpy as np

from actorcritic import ActorCriticAgent, EnvironmentWrapper



In [5]:

# Experiment settings. The whole mapping is handed to ActorCriticAgent; this
# notebook itself reads 'env_name', 'action_size'-style sizing is left to the
# agent, the training limits are forwarded to agent.train, and 'experiment'
# names the saved results file.
config = dict(
    # --- run identification / hardware ---
    experiment='MountainCarContinuous',
    device='cuda',
    # --- network sizing (consumed by the agent; semantics not visible here) ---
    state_size=6,
    action_size=3,
    hidden_sizes=[64, 64],
    # --- optimisation ---
    lr_actor=0.001,
    lr_critic=0.0005,
    verbosity=10,
    # --- environment ---
    env_name='MountainCarContinuous-v0',
    gamma=0.99,
    # --- training loop limits (forwarded to agent.train below) ---
    reward_threshold=80.0,
    max_episodes=5000,
    max_steps=500,
    update_frequency=500,
    discrete=True,
)

In [22]:
class MCCWrapper(EnvironmentWrapper):
    """Drive a continuous-action environment with a small set of discrete actions.

    Each discrete action index is mapped onto one of ``num_actions`` evenly
    spaced values in ``[-1, 1]``. While fewer than 10 goal arrivals have been
    recorded, the environment's own reward is replaced by a shaped reward
    (the "handicap") that pays for speed and adds a bonus for reaching the goal.

    Relies on ``self.env`` and ``self.target_state_size``, which are not set in
    this class and are presumably provided by ``EnvironmentWrapper`` -- verify
    against the base class.
    """

    def __init__(self, env, num_actions=3):
        """Wrap ``env`` and build the discrete-to-continuous action table.

        Args:
            env: The environment to wrap; passed straight to the base class.
            num_actions: Number of discrete actions to expose.
        """
        super().__init__(env)
        # Stored as a plain count of discrete actions (not a gymnasium Space).
        self.action_space = num_actions
        # Continuous value associated with each discrete action index.
        self.action_boundaries = np.linspace(-1, 1, num_actions)
        # Total number of step() calls made through this wrapper.
        self.ticker = 0
        # Number of steps on which the goal position was reached while the
        # handicap reward was still active.
        self.succeeded = 0

    def step(self, action):
        """Advance the wrapped environment by one discrete action.

        Args:
            action: Index of the discrete action; out-of-range values are
                clamped by ``discretize_action``.

        Returns:
            tuple: ``(padded_state, reward, done, info)`` where ``padded_state``
            is the observation zero-padded to ``self.target_state_size`` and
            ``done`` is true when the episode terminated *or* was truncated.
        """
        # Convert the discrete action into a continuous action
        continuous_action = [self.discretize_action(action)]

        # Step using the continuous action. The environment reports natural
        # termination and truncation (e.g. a time limit) separately; either one
        # ends the episode, so both are folded into the single `done` flag that
        # callers of this wrapper receive.
        state, reward, terminated, truncated, info = self.env.step(continuous_action)
        done = bool(terminated or truncated)

        if self.succeeded < 10:  # handicap
            # Replace the native reward: moving is good (state[1] is scaled
            # by 0.1 regardless of direction).
            reward = abs(state[1]) * 0.1

            # Bonus once state[0] reaches the 0.45 threshold.
            if state[0] >= 0.45:
                reward += 100
                self.succeeded += 1

        # Pad state to match target_state_size
        padding = np.zeros(self.target_state_size - len(state))
        padded_state = np.append(state, padding)

        self.ticker += 1

        return padded_state, reward, done, info

    def discretize_action(self, action):
        """Return the continuous value for a discrete action index.

        Despite the name, this maps discrete -> continuous. The index is
        clamped to ``[0, self.action_space - 1]`` before the lookup.
        """
        # Ensure the action is within the valid range
        action = max(0, min(action, self.action_space - 1))
        return self.action_boundaries[action]

In [23]:
# Initialize the environment
env = gym.make(config['env_name'])
# Read the number of discrete actions from the same config that the agent is
# built from, so the wrapper's action table and the agent cannot drift apart
# (presumably the agent sizes its policy output from 'action_size' -- verify).
env_wrapper = MCCWrapper(env, num_actions=config['action_size'])


In [24]:
# Initialize the ActorCriticAgent from the full experiment config defined above.
# The agent receives the whole dict; which keys it reads is decided inside
# the actorcritic module and is not visible in this notebook.
agent = ActorCriticAgent(config)

In [25]:
# Train on the wrapped environment; every limit/threshold comes from `config`.
# `results` is later indexed by 'Episode', 'Reward' and 'Average_100'.
results = agent.train(
    env_wrapper,
    max_episodes=config['max_episodes'],
    max_steps=config['max_steps'],
    reward_threshold=config['reward_threshold'],
    update_frequency=config['update_frequency'],
)

Episode 0, Avg Reward: 0.4180766788389978, PLoss: 0.24419857561588287, VLoss: 0.0004944583051837981
Episode 10, Avg Reward: 0.6881008790885941, PLoss: 0.0584699921309948, VLoss: 0.0004620781692210585
Episode 20, Avg Reward: 0.636839961839267, PLoss: 0.037031374871730804, VLoss: 0.000879179744515568
Episode 30, Avg Reward: 0.6109605053597765, PLoss: -0.25794145464897156, VLoss: 0.000250643992330879
Episode 40, Avg Reward: 0.6175547899389272, PLoss: -0.1533486396074295, VLoss: 0.00020580833370331675
Episode 50, Avg Reward: 0.5975033236424929, PLoss: -0.25046008825302124, VLoss: 0.0002084196894429624
Episode 60, Avg Reward: 0.5747839960813022, PLoss: -0.2340220957994461, VLoss: 0.00021947978530079126
Episode 70, Avg Reward: 0.5789288467937755, PLoss: 0.17237350344657898, VLoss: 0.0007483710069209337
Episode 80, Avg Reward: 0.584081420450645, PLoss: -0.10213816165924072, VLoss: 0.0001845803635660559
Episode 90, Avg Reward: 0.5740652143469852, PLoss: -0.13329079747200012, VLoss: 0.000254740

In [None]:
# Save the models
# Delegates entirely to the agent; the destination and file format are chosen
# inside ActorCriticAgent.save_models and are not visible in this notebook.
agent.save_models()

In [None]:
# plot results
import matplotlib.pyplot as plt

# Per-episode reward and its 100-episode average, drawn on one set of axes.
fig, ax = plt.subplots()
for column, legend_label in (('Reward', 'Reward'), ('Average_100', 'Average 100')):
    ax.plot(results['Episode'], results[column], label=legend_label)
ax.set_xlabel('Episodes')
ax.set_ylabel('Reward')
ax.legend()
plt.show()


NameError: name 'results' is not defined

In [None]:
# Persist the training results under results/<experiment>.npy.
results_path = Path('results') / config['experiment']
# np.save does not create missing directories; without this a fresh checkout
# fails with FileNotFoundError after the whole training run has finished.
results_path.parent.mkdir(parents=True, exist_ok=True)
# np.save appends '.npy' because the path has no such extension.
# NOTE(review): `results` is indexed by column name in the plotting cell, so it
# is probably not a plain ndarray; if it is a dict, np.save stores it pickled
# and np.load will need allow_pickle=True -- confirm the intended format.
np.save(results_path, results)