Segunda tentativa! O meu código tem um loop infinito algures e não consigo imprimir os rewards por resultado!

In [16]:
pip install highway-env 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [43]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

In [71]:
class Node:
    """One node of the Monte Carlo Tree Search tree.

    Attributes:
        visits: number of simulation results back-propagated through this node.
        reward: sum of the rewards back-propagated through this node.
        children: child nodes, one per action already expanded from this node.
        parent: parent node, or None for the root.
        state: state object handed to the agent when querying/applying actions.
        agent: object providing ``get_actions(state)``, ``apply_action(state, action)``
            and ``reward(state)``.
        exploration: weight of the exploration term in the UCB score.
        action: action that was applied on the parent to create this node
            (None for the root).
        step_reward: immediate reward returned by the agent when this node was created.
        terminal: True when the agent reported ``done`` while creating this node.
    """

    def __init__(self, state, agent, parent=None, exploration=1.0, action=None):
        self.visits = 0
        self.reward = 0
        self.children = []
        self.parent = parent
        self.state = state
        self.agent = agent
        self.exploration = exploration
        self.action = action
        self.step_reward = 0
        self.terminal = False

    def untried_actions(self):
        """Return the possible actions that no child of this node represents yet."""
        tried = [child.action for child in self.children]
        return [action for action in self.get_possible_actions() if action not in tried]

    def fully_expand(self):
        """Return True when this node has (at least) one child per possible action."""
        return len(self.children) >= len(self.get_possible_actions())

    def upper_confidence_bound(self):
        """Return the child with the highest UCB1 score.

        Unvisited children score ``inf`` so they are tried first.

        Raises:
            ValueError: if this node has no children.
        """
        if not self.children:
            raise ValueError("upper_confidence_bound() called on a node without children")
        # Clamp to 1 so a parent that was never back-propagated does not produce log(0).
        log_parent_visits = np.log(max(self.visits, 1))
        scores = [
            (child.reward / child.visits)
            + self.exploration * np.sqrt(log_parent_visits / child.visits)
            if child.visits > 0 else float('inf')
            for child in self.children
        ]
        return self.children[int(np.argmax(scores))]

    def select(self):
        """Walk down the tree and return the node to simulate from.

        Descends through fully expanded nodes by UCB score; the first node that
        still has an untried action is expanded and its new child is returned.
        Terminal nodes and nodes without any possible action are returned as-is.
        """
        node = self
        while True:
            if node.terminal:
                return node
            if not node.fully_expand():
                return node.expand()
            if not node.children:
                # No possible actions at all: nothing to descend into.
                return node
            node = node.upper_confidence_bound()

    def expand(self):
        """Create and return the child for the first untried action.

        If every action already has a child, the first child is returned
        (or this node itself when there are no children).
        """
        untried = self.untried_actions()
        if not untried:
            return self.children[0] if self.children else self

        action = untried[0]
        next_state, reward, done = self.apply_action(action)
        child_node = Node(
            state=next_state,
            agent=self.agent,
            parent=self,
            exploration=self.exploration,
            action=action,
        )
        child_node.step_reward = reward
        child_node.terminal = bool(done)
        self.children.append(child_node)
        return child_node

    def get_possible_actions(self):
        """Return the actions the agent reports for this node's state."""
        return self.agent.get_actions(self.state)

    def apply_action(self, action):
        """Apply ``action`` through the agent and return ``(next_state, reward, done)``."""
        step_result = self.agent.apply_action(self.state, action)
        next_state, reward, done = step_result[:3]
        return next_state, reward, done

    def state_reward(self):
        """Return the agent's reward for this node's state.

        This was previously a method named ``reward``; that method could never be
        called on an instance because ``__init__`` assigns the numeric ``reward``
        attribute under the same name.
        """
        return self.agent.reward(self.state)

    def back_propagate(self, reward):
        """Add ``reward`` and one visit to this node and to every ancestor."""
        node = self
        while node is not None:
            node.visits += 1
            node.reward += reward
            node = node.parent


In [68]:
class MCTSAgent:
    """Thin adapter exposing a Gym-style environment to the tree search.

    Attributes:
        env: environment providing ``reset()``, ``step(action)``, ``close()`` and
            a discrete ``action_space`` with an ``n`` attribute.
        current_state: whatever the most recent ``env.reset()`` returned.
        configuration: configuration object stored as given (not read here).
    """

    # Components of the per-step reward breakdown summed by ``reward``.
    _REWARD_KEYS = (
        'collision_reward',
        'right_lane_reward',
        'high_speed_reward',
        'on_road_reward',
    )

    def __init__(self, env, configuration):
        self.env = env
        self.current_state = self.env.reset()
        self.configuration = configuration

    def get_actions(self, state):
        """Return every discrete action index; ``state`` is not consulted."""
        return list(range(self.env.action_space.n))

    def apply_action(self, state, action):
        """Step the environment with ``action`` and return ``(next_state, reward, done)``.

        ``state`` is not used: the step is applied to the environment's current
        internal state.

        ``done`` is True when the episode ended for any reason. With the
        five-element step result ``(obs, reward, terminated, truncated, info)``
        both flags are honoured; looking at ``terminated`` alone means an episode
        that only ends by truncation (e.g. a time limit) is never reported as
        finished, and callers looping ``while not done`` spin forever.
        A four-element result ``(obs, reward, done, info)`` is also accepted.
        """
        step_result = self.env.step(action)
        if len(step_result) >= 5:
            next_state, reward, terminated, truncated = step_result[:4]
            done = bool(terminated or truncated)
        else:
            next_state, reward, done = step_result[:3]
            done = bool(done)
        return next_state, reward, done

    def reward(self, state):
        """Sum the reward components found in ``state[1]['rewards']``.

        ``state`` must be indexable so that ``state[1]`` is a mapping holding a
        ``'rewards'`` mapping with all four component keys.
        """
        rewards = state[1]['rewards']
        return sum(rewards[key] for key in self._REWARD_KEYS)

    def reset(self):
        """Reset the environment, remember the result and return it."""
        self.current_state = self.env.reset()
        return self.current_state

    def close(self):
        """Close the underlying environment."""
        self.env.close()


In [59]:
class MCTS:
    """Monte Carlo Tree Search driver: select, simulate, back-propagate.

    Args:
        agent: object providing ``reset()``, ``get_actions(state)`` and
            ``apply_action(state, action) -> (next_state, reward, done)``.
        iterations: number of select/simulate/back-propagate rounds per search.
        max_rollout_steps: upper bound on the number of steps of one random
            rollout, so a rollout whose agent never reports ``done`` cannot
            loop forever. ``None`` disables the bound.
    """

    def __init__(self, agent, iterations, max_rollout_steps=200):
        self.agent = agent
        self.iterations = iterations
        self.max_rollout_steps = max_rollout_steps

    def simulate(self, node):
        """Run one random rollout and return the sum of the rewards collected.

        NOTE(review): the agent is reset before the rollout and ``node.state`` is
        only passed along as an argument, so whether the rollout really starts
        from ``node`` depends on how the agent uses that argument — confirm.
        """
        total_reward = 0
        done = False
        state = node.state
        self.agent.reset()

        steps = 0
        while not done:
            if self.max_rollout_steps is not None and steps >= self.max_rollout_steps:
                break
            action = np.random.choice(self.agent.get_actions(state))
            next_state, reward, done = self.agent.apply_action(state, action)
            total_reward += reward
            state = next_state
            steps += 1
        return total_reward

    def search(self, initial_state):
        """Build a tree rooted at ``initial_state`` and return the best root child.

        Raises:
            ValueError: if the root ends up without children (e.g. zero iterations).
        """
        root = Node(initial_state, self.agent)
        for _ in range(self.iterations):
            node = root.select()
            reward = self.simulate(node)
            node.back_propagate(reward)
        return self.best_child(root)

    def best_child(self, node):
        """Return the most visited child of ``node`` (ties: higher mean reward).

        The children are ranked by numbers rather than by the node returned from
        ``child.upper_confidence_bound()``, which is not orderable and fails on
        children that have no children of their own.

        Raises:
            ValueError: if ``node`` has no children.
        """
        if not node.children:
            raise ValueError("best_child() called on a node without children")

        def rank(child):
            mean_reward = child.reward / child.visits if child.visits else float('-inf')
            return (child.visits, mean_reward)

        return max(node.children, key=rank)

In [34]:
# Environment configuration dict; it is passed as `config=` to gym.make("highway-v0", ...) elsewhere in this notebook.
configuration = {

    # Parametrization below cannot be changed
    "lanes_count": 4, # NOTE(review): the original comment said "must always have 10 lanes" but the value is 4 — confirm which one is required
    "vehicles_count": 50, # The environment must always have 50 other vehicles
    "duration": 120,  # [s] The environment must never terminate before 120 seconds
    "other_vehicles_type": "highway_env.vehicle.behavior.IDMVehicle", # This is the policy of the other vehicles
    "initial_spacing": 2, # Initial spacing between vehicles needs to be at most 2

    # Refer to https://highway-env.farama.org/observations/ to change the observation space type
    "observation": {
        "type": "Kinematics"
    },

    # Refer to https://highway-env.farama.org/actions/ to change the action space type
    "action": {
        "type": "DiscreteMetaAction",
    },

    # Parameterization below can be changed (as it refers mostly to the reward system)
    "collision_reward": -1,  # The reward received when colliding with a vehicle. (Can be changed)
    "reward_speed_range": [20, 30],  # [m/s] The reward for high speed is mapped linearly from this range to [0, HighwayEnv.HIGH_SPEED_REWARD]. (Can be changed)
    "simulation_frequency": 15,  # [Hz] (Can be changed)
    "policy_frequency": 1,  # [Hz] (Can be changed)
    
    # Parameters defined below are purely for visualization purposes! You can alter them as you please
    "screen_width": 800,  # [px]
    "screen_height": 600,  # [px]
    "centering_position": [0.5, 0.5],
    "scaling": 5,
    "show_trajectories": False,
    "render_agent": True,
    "offscreen_rendering": False
}

In [78]:
def run_mcts_on_highway(agent, n_iterations):
    """Run one MCTS search and return the action chosen at the root.

    Args:
        agent: agent handed to ``MCTS``; must provide ``reset()`` and
            ``get_actions(state)``.
        n_iterations: number of MCTS iterations for this decision.

    Returns:
        The action leading from the root to the node returned by the search.

    NOTE(review): ``agent.reset()`` is called on every decision, so each search
    starts from a freshly reset environment rather than from the state reached
    so far in the episode — confirm this is intended.
    """
    initial_state = agent.reset()  # Reset the environment and get the initial state
    mcts = MCTS(agent, n_iterations)
    best_node = mcts.search(initial_state)

    # The node's state is whatever the environment step returned, not a
    # (state, info) pair, so the action cannot be read from state[1]['action'].
    action = getattr(best_node, "action", None)
    if action is None:
        # Nodes that do not record their action: children are appended in the
        # order of get_actions(), so the child's position identifies the action.
        siblings = best_node.parent.children
        action = agent.get_actions(initial_state)[siblings.index(best_node)]
    return action

def train_agent(agent, n_episodes, n_iterations, max_steps=None):
    """Run ``n_episodes`` episodes, choosing every action with an MCTS search.

    Args:
        agent: agent providing ``reset()``, ``apply_action(state, action)`` and
            ``current_state``.
        n_episodes: number of episodes to run.
        n_iterations: MCTS iterations per decision.
        max_steps: optional cap on the number of decisions per episode; ``None``
            (default) keeps stepping until the agent reports ``done``.

    Returns:
        List with the total reward collected in each episode, suitable for
        plotting (previously nothing was returned).
    """
    episode_rewards = []
    for episode in range(n_episodes):
        agent.reset()
        done = False
        episode_reward = 0
        steps = 0
        # The episode only ends when apply_action reports done (or the cap is hit).
        while not done and (max_steps is None or steps < max_steps):
            action = run_mcts_on_highway(agent, n_iterations)
            _, reward, done = agent.apply_action(agent.current_state, action)
            episode_reward += reward
            steps += 1
        episode_rewards.append(episode_reward)
        print(f"Episode {episode + 1} finished")
    return episode_rewards

n_episodes = 10  # Number of episodes to train
n_iterations = 1000  # Number of MCTS iterations per decision

# NOTE(review): `env` is not created in this cell; the gym.make(...) cell appears later in the notebook, so it must be run first on a fresh kernel.
agent = MCTSAgent(env, configuration)

# Train the agent
train_agent(agent, n_episodes, n_iterations)


KeyboardInterrupt: 

In [23]:
# Create the highway-v0 environment from the `configuration` dict, requesting the "human" render mode.
# NOTE(review): presumably "human" rendering draws every env.step, including steps taken during the tree search — confirm; this could make the search very slow.
env = gym.make("highway-v0", config=configuration, render_mode="human")

In [72]:
def evaluate_agent(agent, n_episodes, n_iterations=None):
    """Run ``n_episodes`` MCTS-driven episodes and report the rewards.

    Args:
        agent: agent providing ``reset()``, ``apply_action(state, action)`` and
            ``current_state``.
        n_episodes: number of evaluation episodes.
        n_iterations: MCTS iterations per decision. When omitted, the notebook
            global ``n_iterations`` is used, as the function did before.

    Returns:
        The average episode reward (``nan`` when ``n_episodes`` is 0).

    Raises:
        ValueError: if ``n_iterations`` is neither passed nor defined globally.
    """
    if n_iterations is None:
        # Backward compatibility: earlier calls relied on the global variable.
        n_iterations = globals().get("n_iterations")
        if n_iterations is None:
            raise ValueError("n_iterations must be passed or defined as a notebook global")

    total_rewards = []
    for episode in range(n_episodes):
        agent.reset()
        done = False
        episode_reward = 0
        while not done:
            action = run_mcts_on_highway(agent, n_iterations)
            _, reward, done = agent.apply_action(agent.current_state, action)
            episode_reward += reward
        total_rewards.append(episode_reward)
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

    # np.mean([]) emits a RuntimeWarning; report nan directly for zero episodes.
    average_reward = np.mean(total_rewards) if total_rewards else float('nan')
    print(f"Average reward over {n_episodes} episodes: {average_reward}")
    return average_reward

KeyboardInterrupt: 

In [45]:
# Build a fresh agent on the existing `env` and evaluate it.
agent = MCTSAgent(env, configuration)
n_iterations = 1000  # MCTS iterations per decision; evaluate_agent reads this global
n_eval_episodes = 10  # Number of evaluation episodes
evaluate_agent(agent, n_eval_episodes)

AttributeError: 'Node' object has no attribute 'back_propagate'

In [44]:
# Run the training separately from the plotting so the rewards can be inspected.
episode_rewards = train_agent(agent, n_episodes, n_iterations)

fig, ax = plt.subplots()
# train_agent may return None when it does not collect rewards; plot nothing then.
ax.plot(episode_rewards if episode_rewards is not None else [])
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
# The agent trained here uses MCTS, not SARSA.
ax.set_title('MCTS Training Progress')
plt.show()

AttributeError: 'Node' object has no attribute 'back_propagate'