In [1]:
# Defines a Russell's Grid environment that follows the Gymnasium Env API
# Develops Value Iteration and Policy Iteration with Dynamic Programming

In [2]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

In [3]:
class RussellsGridEnv(gym.Env):
    """3x4 stochastic "Russell's grid" world exposed through the Gymnasium API.

    States are integers ``row * width + col`` with row 0 at the top::

        .  .  .  G        G = green terminal cell (reward +1 on entry)
        .  B  .  R        R = red terminal cell   (reward -1 on entry)
        S  .  .  .        B = black (impassable) cell, S = start (state 8)

    Actions: 0 = Up, 1 = Right, 2 = Down, 3 = Left.

    Dynamics: with probability 0.8 the agent moves in the chosen direction;
    otherwise (probability 0.2) the direction is drawn uniformly from all four
    directions. A move that would leave the grid or enter the black cell
    leaves the agent where it is. Every transition that does not enter a
    terminal cell yields a reward of -0.04.

    ``self.P[s][a]`` holds the full model as a list of
    ``(probability, next_state, reward, terminated)`` tuples; the probabilities
    of every list sum to 1 and ``step`` samples from exactly this table, so
    planning (value / policy iteration) and simulation agree.
    """

    metadata = {"render_modes": ["human", "ansi", "rgb_array"], "render_fps": 4}

    # Codes stored in ``self.grid``.
    _GREEN = 1
    _RED = 2
    _BLACK = 3

    # (d_row, d_col) for each action index: Up, Right, Down, Left.
    _ACTION_DELTAS = ((-1, 0), (0, 1), (1, 0), (0, -1))

    # Probability that the executed direction is re-drawn uniformly at random.
    _SLIP_PROB = 0.2

    # Rendering lookups keyed by grid code; anything else is an ordinary cell.
    _CELL_COLORS = {1: (0, 255, 0), 2: (255, 0, 0), 3: (0, 0, 0)}
    _DEFAULT_COLOR = (200, 200, 200)
    _CELL_SYMBOLS = {1: "G ", 2: "R ", 3: "B "}

    def __init__(self, render_mode=None):
        """Build the grid, the spaces and the transition table.

        Args:
            render_mode: ``None`` or one of ``metadata["render_modes"]``.
        """
        super().__init__()

        # Grid geometry and special cells, all as (row, col).
        self.height = 3
        self.width = 4
        self.black_cell = (1, 1)
        self.green_cell = (0, 3)
        self.red_cell = (1, 3)
        self.grid_size = (self.height, self.width)

        self.action_space = spaces.Discrete(len(self._ACTION_DELTAS))  # Up, Right, Down, Left
        self.observation_space = spaces.Discrete(self.height * self.width)

        # Cell-type map; 0 marks an ordinary cell.
        self.grid = np.zeros(self.grid_size)
        self.grid[self.green_cell] = self._GREEN  # terminal state
        self.grid[self.red_cell] = self._RED      # terminal state
        self.grid[self.black_cell] = self._BLACK  # impassable state

        assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = render_mode

        # Lazily created pygame handles (human mode only).
        self.window = None
        self.clock = None

        # Bottom-left corner; defined here so rendering works before reset().
        self.start_state = (self.height - 1) * self.width
        self.state = self.start_state

        # Transition probability table P[s][a].
        self.P = self._build_transition_probabilities()

    def reset(self, seed=None, options=None):
        """Put the agent back on the start cell (2, 0).

        Args:
            seed: forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
            options: accepted for API compatibility; unused.

        Returns:
            ``(observation, info)`` with the start state and an empty dict.
        """
        super().reset(seed=seed)
        self.state = self.start_state  # Start at (2, 0)

        if self.render_mode == "human":
            self._render_frame()

        return self.state, {}

    def step(self, action):
        """Sample one transition from ``self.P`` for the current state.

        Randomness comes from ``self.np_random`` so that ``reset(seed=...)``
        makes episodes reproducible.

        Returns:
            ``(observation, reward, terminated, truncated, info)``;
            ``truncated`` is always ``False`` and ``info`` is empty.
        """
        assert self.action_space.contains(action), f"invalid action {action!r}"

        transitions = self.P[self.state][int(action)]
        probabilities = [t[0] for t in transitions]
        choice = self.np_random.choice(len(transitions), p=probabilities)
        _, next_state, reward, terminated = transitions[choice]
        self.state = next_state

        if self.render_mode == "human":
            self._render_frame()

        return self.state, reward, terminated, False, {}

    def step1(self, action):
        """Alias of ``step`` kept for existing callers of the table-driven variant."""
        return self.step(action)

    def render(self):
        """Return the current frame for ``ansi`` / ``rgb_array`` modes.

        In ``human`` mode frames are drawn from ``reset`` / ``step`` and this
        method returns ``None``.
        """
        if self.render_mode in ("rgb_array", "ansi"):
            return self._render_frame()
        return None

    def _render_frame(self):
        """Dispatch to the renderer matching ``self.render_mode``."""
        if self.render_mode == "human":
            return self._render_human()
        elif self.render_mode == "ansi":
            return self._render_ansi()
        elif self.render_mode == "rgb_array":
            return self._render_rgb_array()
        return None

    def _cell_color(self, row, col):
        """RGB colour used to draw the cell at ``(row, col)``."""
        return self._CELL_COLORS.get(int(self.grid[row, col]), self._DEFAULT_COLOR)

    def _render_human(self):
        """Draw the grid and the agent in a pygame window."""
        # pygame is only needed for this mode, so it is imported on demand.
        import pygame

        pix_square_size = 100  # The size of a single grid square in pixels
        window_size = (self.width * pix_square_size, self.height * pix_square_size)

        if self.window is None:
            pygame.init()
            pygame.display.init()
            self.window = pygame.display.set_mode(window_size)
        if self.clock is None:
            self.clock = pygame.time.Clock()

        canvas = pygame.Surface(window_size)
        canvas.fill((255, 255, 255))

        # Draw the grid
        for i in range(self.height):
            for j in range(self.width):
                pygame.draw.rect(
                    canvas,
                    self._cell_color(i, j),
                    pygame.Rect(
                        pix_square_size * j,
                        pix_square_size * i,
                        pix_square_size,
                        pix_square_size,
                    ),
                )

        # Draw the agent
        agent_row, agent_col = divmod(self.state, self.width)
        pygame.draw.circle(
            canvas,
            (0, 0, 255),  # Blue
            (
                agent_col * pix_square_size + pix_square_size // 2,
                agent_row * pix_square_size + pix_square_size // 2,
            ),
            pix_square_size // 3,
        )

        # Add gridlines
        line_color = (0, 0, 0)
        for x in range(self.width + 1):
            pygame.draw.line(
                canvas,
                line_color,
                (pix_square_size * x, 0),
                (pix_square_size * x, self.height * pix_square_size),
                width=3,
            )
        for y in range(self.height + 1):
            pygame.draw.line(
                canvas,
                line_color,
                (0, pix_square_size * y),
                (self.width * pix_square_size, pix_square_size * y),
                width=3,
            )

        self.window.blit(canvas, canvas.get_rect())
        pygame.event.pump()
        pygame.display.update()

        # Throttle to the advertised frame rate.
        self.clock.tick(self.metadata["render_fps"])

    def _render_ansi(self):
        """Print and return a text picture of the grid.

        ``A`` is the agent, ``G``/``R`` the terminal cells, ``B`` the black
        cell and ``.`` an ordinary cell. The text is still printed, as before,
        and is now also returned, which is what Gymnasium expects from the
        ``ansi`` render mode.
        """
        output = ""
        for i in range(self.height):
            for j in range(self.width):
                if i * self.width + j == self.state:
                    output += "A "
                else:
                    output += self._CELL_SYMBOLS.get(int(self.grid[i, j]), ". ")
            output += "\n"
        print(output)
        return output

    def _render_rgb_array(self):
        """Return the frame as a ``(height*100, width*100, 3)`` uint8 array."""
        pix_square_size = 100  # The size of a single grid square in pixels
        canvas = np.zeros(
            (self.height * pix_square_size, self.width * pix_square_size, 3),
            dtype=np.uint8,
        )

        # Draw the grid
        for i in range(self.height):
            for j in range(self.width):
                canvas[
                    i * pix_square_size:(i + 1) * pix_square_size,
                    j * pix_square_size:(j + 1) * pix_square_size,
                ] = self._cell_color(i, j)

        # Draw the agent as a blue square centred in its cell.
        agent_row, agent_col = divmod(self.state, self.width)
        half_side = pix_square_size // 3
        center_r = agent_row * pix_square_size + pix_square_size // 2
        center_c = agent_col * pix_square_size + pix_square_size // 2
        canvas[
            center_r - half_side:center_r + half_side,
            center_c - half_side:center_c + half_side,
        ] = (0, 0, 255)

        return canvas

    def _move(self, cell, d_row, d_col):
        """Cell reached from ``cell`` by ``(d_row, d_col)``; blocked moves stay put."""
        row = min(max(cell[0] + d_row, 0), self.height - 1)
        col = min(max(cell[1] + d_col, 0), self.width - 1)
        if self.grid[row, col] == self._BLACK:
            return cell
        return (row, col)

    def _build_transition_probabilities(self):
        """Build ``P[s][a] -> [(prob, next_state, reward, terminated), ...]``.

        * Ordinary cells: each of the four directions gets ``_SLIP_PROB / 4``
          and the chosen one an extra ``1 - _SLIP_PROB``; outcomes that land
          on the same cell are merged, so every list sums to 1.
        * Terminal cells are absorbing: a single zero-reward self-loop flagged
          as terminated, so the +1 / -1 is collected only once, on entry.
        * The black cell can never be occupied; it keeps a placeholder
          self-loop so that ``P`` has an entry for every state index.
        """
        n_actions = len(self._ACTION_DELTAS)
        per_direction = self._SLIP_PROB / n_actions
        intended_bonus = 1.0 - self._SLIP_PROB

        P = {}
        for s in range(self.height * self.width):
            cell = divmod(s, self.width)

            if cell == self.black_cell:
                P[s] = {a: [(1.0, s, -0.1, False)] for a in range(n_actions)}  # Impassable state
                continue

            if self._is_done(cell):
                P[s] = {a: [(1.0, s, 0.0, True)] for a in range(n_actions)}
                continue

            P[s] = {}
            for action in range(n_actions):
                # Probability mass accumulated per destination cell.
                mass = {}
                for direction, (d_row, d_col) in enumerate(self._ACTION_DELTAS):
                    prob = per_direction + (intended_bonus if direction == action else 0.0)
                    target = self._move(cell, d_row, d_col)
                    mass[target] = mass.get(target, 0.0) + prob

                P[s][action] = [
                    (
                        prob,
                        target[0] * self.width + target[1],
                        self._get_reward(target),
                        self._is_done(target),
                    )
                    for target, prob in mass.items()
                ]

        return P

    def _get_reward(self, state):
        """Reward for arriving at the ``(row, col)`` cell ``state``."""
        code = self.grid[state]
        if code == self._GREEN:
            return 1.0
        elif code == self._RED:
            return -1.0
        else:
            return -0.04

    def _is_done(self, state):
        """Whether the ``(row, col)`` cell ``state`` is terminal (green or red)."""
        return bool(self.grid[state] in (self._GREEN, self._RED))

    def close(self):
        """Shut down the pygame window, if one was opened; safe to call twice."""
        if self.window is not None:
            import pygame

            pygame.display.quit()
            pygame.quit()
            # Drop the handles so a later render re-creates them cleanly.
            self.window = None
            self.clock = None

# Register the environment.  The membership check keeps this cell idempotent:
# re-running it would otherwise register the same id a second time.  The entry
# point is a "module:attribute" string, so it is looked up by name when
# gym.make() is called rather than bound to the class object at this point.
if 'RussellsGrid-v0' not in gym.registry:
    gym.register(
        id='RussellsGrid-v0',
        entry_point='__main__:RussellsGridEnv',
    )

In [5]:
    # Smoke test: instantiate the registered environment in text mode, reset it
    # to the start state and draw the initial grid.
    env = gym.make('RussellsGrid-v0', render_mode='ansi')
    observation, info = env.reset()
    env.render()

    # Episode-termination flag; nothing in this cell reads it yet.
    done = False

. . . G 
. B . R 
A . . . 



  logger.warn(


In [6]:
# Record the versions of the imported packages, Python and the OS as plain text
# (html=False) so the notebook's results can be tied to a concrete environment.
import session_info
session_info.show(html=False)

-----
gymnasium           0.29.1
numpy               1.26.4
session_info        1.0.0
-----
IPython             8.26.0
jupyter_client      8.6.2
jupyter_core        5.7.2
-----
Python 3.12.3 (main, Sep 11 2024, 14:17:37) [GCC 13.2.0]
Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.39
-----
Session information updated at 2024-10-28 13:15
