In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pygame
from torch_geometric.data import Data
import logging

# Module-level logger used by CustomEnv for lifecycle (INFO) and per-step (DEBUG) messages.
logger = logging.getLogger(__name__)

class CustomEnv(gym.Env):
    """Gymnasium environment that rewards an agent for visiting outposts on a grid.

    Per-episode bookkeeping kept on the instance:

    * ``episode_step`` / ``max_episode_steps`` -- step counter and time limit.
    * ``total_reward`` -- running sum of the rewards returned by ``step``.
    * ``outposts_visited`` -- coordinates of outposts reached this episode.
    * ``early_stop`` -- set once every outpost has been visited.
    * ``previous_min_distance`` -- distance to the nearest unvisited outpost
      after the previous step, or ``None`` when no baseline exists yet.

    Attributes such as ``agent_controler``, ``current_gm``, ``outpost_coords``
    and the reward constants are expected to be created by the initialization
    code that is elided in this listing.
    """

    def __init__(self, game_manager_args, simulation_manager_args, model_args, plot=False):
        """Initialize the environment and its per-episode counters."""
        super(CustomEnv, self).__init__()
        logger.info("Initializing CustomEnv")
        # ... (previous initialization code remains the same)

        self.max_episode_steps = 1000  # Maximum number of steps per episode
        self.episode_step = 0
        self.total_reward = 0
        # No distance baseline until the first step of an episode has been taken.
        self.previous_min_distance = None
        logger.info("CustomEnv initialized successfully")

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Clears the step counter, accumulated reward, visited outposts, the
        early-stop flag and the distance-shaping baseline.
        """
        logger.info("Resetting environment")
        self.episode_step = 0
        self.total_reward = 0
        self.outposts_visited.clear()
        self.early_stop = False
        # Forget the previous episode's distance so the first step of the new
        # episode is not compared against a stale value.
        self.previous_min_distance = None
        # ... (rest of the reset code remains the same)
        logger.info(f"Environment reset complete. Initial agent position: {(self.agent_controler.agent.grid_x, self.agent_controler.agent.grid_y)}")
        return observation, {}

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, terminated, truncated, info)``.

        ``terminated`` is True only when all outposts have been visited
        (``early_stop``); hitting ``max_episode_steps`` is reported through
        ``truncated`` alone, so a time-out is not mistaken for a true terminal
        state by the learning algorithm.
        """
        # Lazy %-style arguments: this runs every step, so skip string
        # formatting (including the info dict) unless DEBUG is enabled.
        logger.debug("Taking step %s with action %s", self.episode_step, action)
        self.episode_step += 1

        prev_position = (self.agent_controler.agent.grid_x, self.agent_controler.agent.grid_y)
        self.agent_controler.agent_action(action)
        new_position = (self.agent_controler.agent.grid_x, self.agent_controler.agent.grid_y)

        self.current_gm.rerender()
        reward = self._calculate_reward()
        self.total_reward += reward

        # Keep the two end-of-episode signals separate: task completion
        # terminates, the step limit truncates.
        terminated = self.early_stop
        truncated = self.episode_step >= self.max_episode_steps

        observation = self._get_observation()
        info = {
            "episode_step": self.episode_step,
            "prev_position": prev_position,
            "new_position": new_position,
            "energy_spent": self.agent_controler.energy_spent,
            "outposts_visited": len(self.outposts_visited),
            "total_reward": self.total_reward
        }

        logger.debug(
            "Step complete. Reward: %s, Total Reward: %s, Terminated: %s, Truncated: %s, Info: %s",
            reward, self.total_reward, terminated, truncated, info,
        )

        if terminated or truncated:
            logger.info(f"Episode ended. Total steps: {self.episode_step}, Total reward: {self.total_reward}, Outposts visited: {len(self.outposts_visited)}/{len(self.outpost_coords)}")

        return observation, reward, terminated, truncated, info

    def _calculate_reward(self):
        """Return the reward for the agent's current position.

        * Standing on a not-yet-visited outpost yields ``new_outpost_reward``;
          if that was the last one, ``completion_reward`` is added and
          ``early_stop`` is set.
        * Otherwise the distance to the nearest unvisited outpost is compared
          with the baseline from the previous step: getting closer yields
          ``closer_to_outpost_reward``, getting farther yields
          ``farther_from_outpost_penalty``, no change (or no baseline yet)
          yields nothing.
        * With no unvisited outposts left, ``penalty`` is applied.
        """
        logger.debug("Calculating reward")
        agent_pos = (self.agent_controler.agent.grid_x, self.agent_controler.agent.grid_y)
        reward = 0

        if agent_pos in self.outpost_coords and agent_pos not in self.outposts_visited:
            reward += self.new_outpost_reward
            self.outposts_visited.add(agent_pos)
            logger.debug("Agent reached new outpost. Reward: %s", self.new_outpost_reward)

            if len(self.outposts_visited) == len(self.outpost_coords):
                reward += self.completion_reward
                logger.debug("All outposts visited. Additional reward: %s", self.completion_reward)
                self.early_stop = True

            # The set of targets just changed: re-baseline against the
            # remaining outposts so the next step is not penalised for being
            # "farther" than the outpost the agent is standing on.
            self.previous_min_distance = self._nearest_unvisited_distance(agent_pos)

        else:
            current_min_distance = self._nearest_unvisited_distance(agent_pos)
            if current_min_distance is not None:
                prev_min_distance = getattr(self, "previous_min_distance", None)

                # Only shape the reward when there is a real baseline to
                # compare with (none exists on the first step of an episode).
                if prev_min_distance is not None:
                    if current_min_distance < prev_min_distance:
                        reward += self.closer_to_outpost_reward
                        logger.debug("Agent moved closer to an outpost. Reward: %s", self.closer_to_outpost_reward)
                    elif current_min_distance > prev_min_distance:
                        reward += self.farther_from_outpost_penalty
                        logger.debug("Agent moved away from outposts. Penalty: %s", self.farther_from_outpost_penalty)

                self.previous_min_distance = current_min_distance
            else:
                reward += self.penalty
                logger.debug("No unvisited outposts. Penalty: %s", self.penalty)

        logger.debug(
            "Calculated reward: %s, Outposts visited: %s/%s",
            reward, len(self.outposts_visited), len(self.outpost_coords),
        )
        return reward

    def _nearest_unvisited_distance(self, agent_pos):
        """Return the distance from ``agent_pos`` to the nearest unvisited outpost, or ``None`` if none remain.

        Uses ``manhattan_distance``, which is expected to be defined or
        imported elsewhere in this module (not visible in this listing).
        """
        return min(
            (
                manhattan_distance(agent_pos, outpost)
                for outpost in self.outpost_coords
                if outpost not in self.outposts_visited
            ),
            default=None,
        )

    # ... (rest of the methods remain the same)

In [None]:
import os
import torch
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy
from custom_env import CustomEnv
from agent_model import AgentModel
import cProfile
import pstats
import logging
import traceback
import time
import warnings

# Configure root logging at DEBUG so that every record from this script's
# logger is emitted with a timestamp and level.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Report every warning each time it is triggered (instead of once per
# location) so repeated warnings raised during training are not suppressed.
warnings.filterwarnings("always")

# NOTE(review): presumably pygame reads this flag when it is first imported;
# custom_env is imported above this line, so confirm the assignment
# happens early enough to have an effect.
os.environ['PYGAME_DETECT_AVX2'] = '1'

# ... (rest of the setup code remains the same)

def calculate_mean_reward(ep_info_buffer):
    """Average the ``"r"`` entry over all records in ``ep_info_buffer``.

    Returns ``0.0`` when the buffer holds no records.
    """
    if len(ep_info_buffer) == 0:
        return 0.0
    episode_returns = [record["r"] for record in ep_info_buffer]
    return np.mean(episode_returns)

def calculate_mean_episode_length(ep_info_buffer):
    """Average the ``"l"`` entry over all records in ``ep_info_buffer``.

    Returns ``0.0`` when the buffer holds no records.
    """
    if len(ep_info_buffer) == 0:
        return 0.0
    episode_lengths = [record["l"] for record in ep_info_buffer]
    return np.mean(episode_lengths)

def main():
    """Train the model in fixed-size chunks under a wall-time budget.

    Training is split into repeated ``learn`` calls of ``chunk_timesteps``
    steps each so that, between chunks, the elapsed time can be checked
    against ``timeout`` and recent episode statistics can be logged.
    Warnings raised inside a chunk are recorded and logged. Any exception is
    logged with its traceback rather than propagated.

    ``rl_model`` and ``eval_callback`` are expected to be created by the
    setup code that is elided in this listing.
    """
    try:
        logger.info("Starting main function")
        
        # ... (rest of the setup code remains the same)

        # Run the training
        logger.info("Starting model training")
        total_timesteps = 1000000
        chunk_timesteps = 2048  # 2048 is the n_steps parameter
        timeout = 3600  # 1 hour timeout (keep in sync with the warning text below)
        # Monotonic clock: the budget must not be affected by system clock
        # adjustments that happen while training is running.
        start_time = time.monotonic()

        try:
            chunk_starts = range(0, total_timesteps, chunk_timesteps)
            for iteration, first_step in enumerate(chunk_starts, start=1):
                last_step = min(first_step + chunk_timesteps, total_timesteps)
                logger.info(f"Training iteration {iteration}, timesteps {first_step}-{last_step}")
                
                # Check for timeout
                if time.monotonic() - start_time > timeout:
                    logger.warning("Training timed out after 1 hour")
                    break

                with warnings.catch_warnings(record=True) as w:
                    # reset_num_timesteps=False continues the same run
                    # instead of restarting the timestep counter each chunk.
                    rl_model.learn(total_timesteps=chunk_timesteps, callback=eval_callback, reset_num_timesteps=False)
                    if w:
                        logger.warning(f"Warnings during training: {[str(warn.message) for warn in w]}")

                logger.info(f"Completed training iteration {iteration}")

                # Log some training statistics
                mean_reward = calculate_mean_reward(rl_model.ep_info_buffer)
                mean_episode_length = calculate_mean_episode_length(rl_model.ep_info_buffer)
                logger.info(f"Recent mean reward: {mean_reward:.2f}")
                logger.info(f"Recent mean episode length: {mean_episode_length:.2f}")

                # Log the size of the episode info buffer
                logger.info(f"Episode info buffer size: {len(rl_model.ep_info_buffer)}")

                # Log a sample of the episode info buffer
                if rl_model.ep_info_buffer:
                    logger.info(f"Sample episode info: {rl_model.ep_info_buffer[-1]}")

        except Exception as e:
            # Best effort: a failed training run is logged (with traceback)
            # and execution continues with whatever follows.
            logger.exception("Error during training: %s", e)
        
        logger.info("Model training completed or stopped")

        # ... (rest of the evaluation and cleanup code remains the same)

    except Exception as e:
        # Top-level boundary: log the failure with its traceback.
        logger.exception("An error occurred in the main function: %s", e)

# Run training only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pygame
from torch_geometric.data import Data
import logging

# Module-level logger used by CustomEnv for lifecycle (INFO) and per-step (DEBUG) messages.
logger = logging.getLogger(__name__)

class CustomEnv(gym.Env):
    """Gymnasium environment in which an agent moves on a grid and visits outposts.

    Only the logging and step-counting parts are shown in this listing; the
    attributes used below (``agent_controler``, ``current_gm``,
    ``early_stop``, ``outposts_visited``, ``outpost_coords``,
    ``current_reward``) are expected to be set up by the elided code.
    """

    def __init__(self, game_manager_args, simulation_manager_args, model_args, plot=False):
        """Initialize the environment and zero the step counter."""
        super(CustomEnv, self).__init__()
        logger.info("Initializing CustomEnv")
        # ... (rest of the initialization code remains the same)
        self.step_count = 0
        logger.info("CustomEnv initialized successfully")

    # ... (other methods remain the same)

    def reset(self, seed=None, options=None):
        """Reset the step counter and return ``(observation, info)`` with an empty info dict."""
        logger.info("Resetting environment")
        self.step_count = 0
        # ... (rest of the reset code remains the same)
        # NOTE(review): the whole observation is formatted at INFO level on
        # every reset -- confirm this is acceptable if observations are large.
        logger.info(f"Environment reset complete. Initial observation: {observation}")
        return observation, {}

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, terminated, truncated, info)``.

        ``terminated`` mirrors ``self.early_stop``; ``truncated`` is always
        False here.
        """
        # Logged before the increment, so the message shows the pre-step count.
        logger.debug(f"Taking step {self.step_count} with action {action}")
        self.step_count += 1
        self.agent_controler.agent_action(action)
        self.current_gm.rerender()
        reward = self._calculate_reward()
        terminated = self.early_stop
        # NOTE(review): no step limit is visible in this class, so an episode
        # ends only when early_stop is set -- confirm a time limit is applied
        # elsewhere (e.g. by a wrapper).
        truncated = False
        observation = self._get_observation()
        info = {
            "step_count": self.step_count,
            "agent_position": (self.agent_controler.agent.grid_x, self.agent_controler.agent.grid_y),
            "energy_spent": self.agent_controler.energy_spent,
            "outposts_visited": len(self.outposts_visited)
        }
        logger.debug(f"Step complete. Reward: {reward}, Terminated: {terminated}, Truncated: {truncated}, Info: {info}")
        return observation, reward, terminated, truncated, info

    def _calculate_reward(self):
        """Return ``self.current_reward`` after the (elided) reward computation has run."""
        logger.debug("Calculating reward")
        # ... (rest of the method code remains the same)
        logger.debug(f"Calculated reward: {self.current_reward}, Outposts visited: {len(self.outposts_visited)}/{len(self.outpost_coords)}")
        return self.current_reward

    # ... (rest of the class methods remain the same)