In [None]:
"""
TODO:
- Test updating the policy every step
- Try a smaller value network with a bigger policy network
- Try a bigger value network with a smaller policy network
- Tune the learning rates
"""

In [6]:
import os

import gymnasium as gym
import numpy as np

from actor_critic_agent import ActorCriticAgent
from env_wrapper import EnvironmentWrapper

In [7]:

# Experiment configuration shared by the agent and the training call.
config = dict(
    # --- run identity / hardware ---
    experiment='MountainCarContinuous',
    device='cuda',
    # --- network dimensions ---
    # NOTE(review): state_size is larger than MountainCar's 2-value observation;
    # presumably it matches the length the wrapper pads states to — confirm.
    state_size=6,
    action_size=3,
    hidden_sizes=[64, 64],
    # --- optimisation ---
    lr_actor=0.001,
    lr_critic=0.0005,
    verbosity=10,
    # --- environment / training schedule ---
    env_name='MountainCarContinuous-v0',
    gamma=0.99,
    reward_threshold=80.0,
    max_episodes=5000,
    max_steps=500,
    update_frequency=500,
    discrete=True,
)

In [8]:
class MCCWrapper(EnvironmentWrapper):
    """Adapt MountainCarContinuous to a discrete-action, 4-tuple ``step`` API.

    The wrapper does three things on every step:

    * maps a discrete action index onto an evenly spaced continuous force
      in ``[-1, 1]``;
    * while the "handicap" is active, replaces the environment reward with a
      shaped reward (see ``step``);
    * zero-pads the observation up to ``self.target_state_size``.

    NOTE(review): ``target_state_size`` is not assigned in this class; it is
    presumably provided by ``EnvironmentWrapper`` — confirm.
    """

    # Reward-shaping constants used while the handicap is active.
    HANDICAP_SUCCESSES = 10       # shaping is switched off after this many goal hits
    VELOCITY_REWARD_SCALE = 0.1   # multiplier applied to |state[1]|
    GOAL_POSITION = 0.45          # state[0] at or beyond this earns the goal bonus
    GOAL_BONUS = 100              # bonus added when GOAL_POSITION is reached

    def __init__(self, env, num_actions=3):
        """Wrap ``env`` and build the discrete-to-continuous action table.

        Args:
            env: The underlying environment; its ``step`` must return the
                Gymnasium 5-tuple ``(obs, reward, terminated, truncated, info)``.
            num_actions: Number of discrete actions (must be >= 1). They are
                spread evenly over ``[-1, 1]``.

        Raises:
            ValueError: If ``num_actions`` is smaller than 1.
        """
        if num_actions < 1:
            raise ValueError(f"num_actions must be >= 1, got {num_actions}")

        super().__init__(env)
        # Stored as a plain int (the number of discrete actions), not a Space.
        self.action_space = num_actions
        # Continuous force associated with each discrete action index.
        self.action_boundaries = np.linspace(-1, 1, num_actions)
        self.ticker = 0       # total number of step() calls made through this wrapper
        self.succeeded = 0    # number of goal hits counted while the handicap was active

    def step(self, action):
        """Advance the environment by one discrete action.

        Args:
            action: Discrete action index; out-of-range values are clamped.

        Returns:
            tuple: ``(padded_state, reward, done, info)`` where ``done`` is
            true when the episode either terminated or was truncated.
        """
        # The wrapped env expects a length-1 sequence holding the force.
        continuous_action = [self.discretize_action(action)]

        # Gymnasium returns (obs, reward, terminated, truncated, info). Both end
        # flags must count as "done": a truncated episode also needs a reset.
        state, reward, terminated, truncated, info = self.env.step(continuous_action)
        done = bool(terminated or truncated)

        if self.succeeded < self.HANDICAP_SUCCESSES:  # handicap
            # Shaped reward replaces the env reward: moving is good.
            reward = abs(state[1]) * self.VELOCITY_REWARD_SCALE

            if state[0] >= self.GOAL_POSITION:
                reward += self.GOAL_BONUS
                self.succeeded += 1

        # Pad state to match target_state_size; never request a negative
        # number of zeros if the observation is already long enough.
        pad_len = max(0, self.target_state_size - len(state))
        padded_state = np.append(state, np.zeros(pad_len))

        self.ticker += 1

        return padded_state, reward, done, info

    def discretize_action(self, action):
        """Return the continuous force for a discrete action index.

        Despite the name, this maps *from* the discrete index *to* the
        continuous value. The index is clamped to ``[0, action_space - 1]``.
        """
        # Ensure the action is within the valid range
        index = max(0, min(action, self.action_space - 1))
        return self.action_boundaries[index]

In [9]:
# Initialize the environment and wrap it so the agent sees discrete actions.
# The number of discrete actions comes from the config so that the wrapper and
# the agent's 'action_size' cannot drift apart.
env = gym.make(config['env_name'])
env_wrapper = MCCWrapper(env, num_actions=config['action_size'])


In [10]:
# Initialize the ActorCriticAgent from the shared `config` dict defined above.
# NOTE(review): config['device'] is 'cuda' — presumably this needs a GPU;
# confirm how the agent behaves on a CPU-only machine.
agent = ActorCriticAgent(config)

In [11]:
# Run training; the schedule parameters are forwarded unchanged from `config`.
train_options = {
    key: config[key]
    for key in ('max_episodes', 'max_steps', 'reward_threshold', 'update_frequency')
}
results = agent.train(env_wrapper, **train_options)

Episode 0, Avg Reward: 0.4310501144690361, PLoss: 1.0300841331481934, VLoss: 0.0019827433861792088
Episode 10, Avg Reward: 0.6139149357135726, PLoss: 0.6070703268051147, VLoss: 0.0010550760198384523
Episode 20, Avg Reward: 5.354370473145429, PLoss: 0.01802113465964794, VLoss: 0.00042641093023121357
Episode 30, Avg Reward: 3.7900152947757895, PLoss: 0.05094999819993973, VLoss: 0.0009404005249962211
Episode 40, Avg Reward: 2.9894814574406428, PLoss: -0.42732107639312744, VLoss: 0.0004448716063052416
Episode 50, Avg Reward: 2.5133931816800463, PLoss: 0.05232150852680206, VLoss: 0.0004373435222078115
Episode 60, Avg Reward: 2.1828055938089808, PLoss: -0.27114614844322205, VLoss: 0.00040682716644369066
Episode 70, Avg Reward: 1.9579469788191963, PLoss: 0.11124685406684875, VLoss: 0.0004946996923536062
Episode 80, Avg Reward: 1.7852081772919222, PLoss: -0.3179815113544464, VLoss: 0.0002451833279337734
Episode 90, Avg Reward: 1.6514990069823274, PLoss: -0.2750757038593292, VLoss: 0.0002058357

KeyboardInterrupt: 

In [None]:
# Save the models.
# NOTE(review): save_models() takes no arguments here, so the destination is
# decided inside ActorCriticAgent — confirm where the files are written.
agent.save_models()

In [None]:
# Plot the per-episode reward next to the 'Average_100' series.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
episodes = results['Episode']
for column, legend_name in (('Reward', 'Reward'), ('Average_100', 'Average 100')):
    ax.plot(episodes, results[column], label=legend_name)
ax.set_xlabel('Episodes')
ax.set_ylabel('Reward')
ax.legend()
plt.show()


NameError: name 'results' is not defined

In [None]:
# Persist the training history. np.save does not create missing directories,
# so make sure 'results/' exists first; np.save appends '.npy' to the name.
# NOTE(review): if `results` is not an ndarray, np.save pickles it and
# np.load will then need allow_pickle=True — confirm what agent.train returns.
os.makedirs('results', exist_ok=True)
np.save(f'results/{config["experiment"]}', results)