# In [6]:
import pygame
import numpy as np
import pickle
import random

# Define colors (RGB tuples)
BLACK = (0, 0, 0)
WHITE = (255, 255, 255)
GREEN = (0, 150, 0)  # Darker shade of green
RED = (255, 0, 0)
BLUE = (0, 0, 255)

# Gridworld dimensions
n = 5  # Number of rows
m = 5  # Number of columns

# Cell size
cell_size = 120  # Side length of one square grid cell, in pixels

# Vertical space reserved below the grid for the "Log:" header and the
# five log lines that draw_gridworld renders (last line starts ~170 px
# below the grid).
log_panel_height = 200

# Initialize Pygame
pygame.init()
# The window must hold the whole grid plus the log panel; a fixed 700 px
# height cut off most of the log lines for a 5-row grid of 120 px cells.
screen_width = max(650, m * cell_size)
screen_height = n * cell_size + log_panel_height
screen = pygame.display.set_mode((screen_width, screen_height))
pygame.display.set_caption("Gridworld Reinforcement Learning")

# Font for text display (Comic Sans MS, 20 pt)
font = pygame.font.SysFont('comicsansms', 20)

# Logging: messages appended here are shown in the on-screen log panel
log = []

def draw_gridworld(grid, V=None):
    """
    Draws the gridworld on the screen with rewards, values (optional), and log entries.

    Args:
        grid: 2-D array indexed as ``grid[row, col]``. A cell equal to 'X'
            is drawn as an obstacle, a cell equal to '1' as the goal; the
            cell's content is rendered as its reward label.
        V: Optional 2-D array of state values indexed like ``grid``. When
            given, each non-obstacle cell also shows its value rounded to
            two decimals.

    The function draws onto the module-level ``screen`` using the
    module-level ``font``, ``n``, ``m``, ``cell_size`` and ``log``; it does
    not flip/update the display itself.
    """
    screen.fill(WHITE)  # Start from a blank white frame

    half_cell = cell_size // 2
    # With values shown, move the reward label up and the value label down
    # so the two texts no longer sit on top of each other.
    label_offset = cell_size // 4 if V is not None else 0

    for i in range(n):
        for j in range(m):
            cell_rect = pygame.Rect(j * cell_size, i * cell_size, cell_size, cell_size)
            cell = grid[i, j]
            is_obstacle = cell == 'X'

            # Pick a text colour that contrasts with the cell background;
            # using the fill colour for the text made every label invisible.
            if is_obstacle:
                fill_color, text_color = BLACK, WHITE
            elif cell == '1':  # Goal
                fill_color, text_color = GREEN, WHITE
            else:
                fill_color, text_color = WHITE, BLACK

            pygame.draw.rect(screen, fill_color, cell_rect)
            pygame.draw.rect(screen, BLACK, cell_rect, 1)  # Cell border on top of the fill

            center_x = j * cell_size + half_cell
            center_y = i * cell_size + half_cell

            # Reward label (upper part of the cell when values are shown)
            reward_text = font.render(str(cell), True, text_color)
            reward_text_rect = reward_text.get_rect(center=(center_x, center_y - label_offset))
            screen.blit(reward_text, reward_text_rect)

            # Value label (lower part of the cell); obstacles have no meaningful value
            if V is not None and not is_obstacle:
                value_text = font.render(str(round(V[i, j], 2)), True, text_color)
                value_text_rect = value_text.get_rect(center=(center_x, center_y + label_offset))
                screen.blit(value_text, value_text_rect)

    # Draw log header
    log_text = font.render("Log:", True, BLACK)
    screen.blit(log_text, (10, n * cell_size + 20))

    # Show up to the five most recent entries, but never more lines than
    # fit above the bottom edge of the surface, so the newest entry is
    # always visible instead of being drawn off-screen.
    first_entry_y = n * cell_size + 70
    line_spacing = 25
    lines_that_fit = (screen.get_height() - first_entry_y) // line_spacing
    visible_lines = max(0, min(5, lines_that_fit))
    recent_entries = log[-visible_lines:] if visible_lines else []
    for index, entry in enumerate(recent_entries):
        log_entry = font.render(entry, True, BLACK)
        screen.blit(log_entry, (20, first_entry_y + index * line_spacing))

def draw_agent(agent_pos):
    """
    Renders the agent as a red disc with a small black dot at its centre.

    ``agent_pos`` is indexed as (row, column); the disc is centred in that
    grid cell on the module-level ``screen``.
    """
    half_cell = cell_size // 2
    # Screen x comes from the column, screen y from the row.
    centre = (agent_pos[1] * cell_size + half_cell,
              agent_pos[0] * cell_size + half_cell)

    body_radius = cell_size // 3
    pygame.draw.circle(screen, RED, centre, body_radius)
    # Marker dot in the middle of the agent's cell
    pygame.draw.circle(screen, BLACK, centre, 3)

def get_valid_actions(grid, state):
    """
    Lists the actions whose resulting cell is not an obstacle.

    Each of 'up', 'down', 'left', 'right' is passed to ``get_next_state``;
    the action is kept when the returned position is not None and the grid
    cell at that position is not 'X'. The result preserves that action
    order.
    """
    def leads_to_open_cell(action):
        target = get_next_state(state, action)
        return target is not None and grid[target[0], target[1]] != 'X'

    return [action for action in ('up', 'down', 'left', 'right')
            if leads_to_open_cell(action)]

def value_iteration(grid, start, goal, discount_factor=0.9, max_iterations=10000):
    """
    Performs Value Iteration to calculate the state value table, addressing invalid moves.

    Args:
        grid: 2-D NumPy array; cells equal to 'X' are obstacles and are
            skipped (their value stays 0).
        start: Accepted for interface compatibility; not used by the
            computation.
        goal: (row, column) of the goal cell. The shaping reward of a cell
            is ``1 - manhattan_distance_to_goal / max_possible_distance``.
        discount_factor: Discount applied to the successor state's value.
        max_iterations: Upper bound on the number of sweeps, so the
            function returns even when the values do not converge (for
            example with ``discount_factor >= 1``).

    Returns:
        A float array with the same shape as ``grid`` holding the state
        values after convergence (largest change below 1e-8) or after
        ``max_iterations`` sweeps, whichever comes first.

    NOTE(review): successor positions come from ``get_next_state``, which
    clamps against the module-level grid dimensions, so ``grid`` is assumed
    to have that same shape - confirm before using other grid sizes.
    """
    n = grid.shape[0]
    m = grid.shape[1]
    V = np.zeros((n, m))

    goal = tuple(goal)  # Allow lists/arrays; comparisons below are against tuples

    # Largest Manhattan distance any cell can have from the goal. For a
    # goal in the bottom-right corner this equals (n - 1) + (m - 1). The
    # lower bound of 1 avoids a division by zero on a 1x1 grid.
    max_distance = max(goal[0], n - 1 - goal[0]) + max(goal[1], m - 1 - goal[1])
    max_distance = max(max_distance, 1)

    for _ in range(max_iterations):
        delta = 0
        for i in range(n):
            for j in range(m):
                if grid[i, j] == 'X':
                    continue

                v_old = V[i, j]

                # Reward based on distance to the goal that was passed in
                # (replace with your desired reward function)
                distance_to_goal = abs(i - goal[0]) + abs(j - goal[1])
                reward = 1.0 - distance_to_goal / max_distance  # Higher reward closer to goal

                # Only consider valid actions to avoid invalid transitions
                expected_rewards = []
                for action in get_valid_actions(grid, (i, j)):
                    new_pos = get_next_state((i, j), action)
                    if new_pos == goal:
                        # NOTE(review): entering the goal is valued at 0, which is
                        # lower than any ordinary move; confirm this is intended.
                        expected_rewards.append(0)
                    elif new_pos == (i, j):
                        # Bumping into the edge leaves the agent in place; penalize it
                        expected_rewards.append(-1.0)
                    else:
                        expected_rewards.append(reward + discount_factor * V[new_pos[0], new_pos[1]])

                # Cells with no valid action keep their current value
                if expected_rewards:
                    V[i, j] = max(expected_rewards)

                delta = max(delta, abs(v_old - V[i, j]))

        if delta < 1e-8:  # Convergence threshold
            break

    return V

def get_next_state(state, action, rows=None, cols=None):
    """
    Returns the next state after taking an action from the current state,
    handling boundary conditions.

    Args:
        state: Current position, indexed as (row, column).
        action: One of 'up', 'down', 'left', 'right'.
        rows: Number of grid rows; defaults to the module-level ``n``.
        cols: Number of grid columns; defaults to the module-level ``m``.

    Returns:
        The (row, column) tuple reached by the move. Both coordinates are
        clamped to the grid, so a move against an edge returns the
        position unchanged.

    Raises:
        ValueError: If ``action`` is not one of the four known actions.
    """
    offsets = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }
    if action not in offsets:
        # Previously this fell through to an UnboundLocalError at the return.
        raise ValueError(f"Unknown action: {action!r}")

    if rows is None:
        rows = n
    if cols is None:
        cols = m

    row_step, col_step = offsets[action]
    new_row_index = min(max(state[0] + row_step, 0), rows - 1)
    new_col_index = min(max(state[1] + col_step, 0), cols - 1)

    return (new_row_index, new_col_index)

def get_action_index(action):
    """
    Maps an action name to its position in the fixed order
    'up', 'down', 'left', 'right' (0 through 3).

    A name outside that list raises ValueError from ``list.index``.
    """
    return ['up', 'down', 'left', 'right'].index(action)

# Placeholder grid (modify with appropriate rewards).
# Rows are listed top to bottom; 'X' marks an obstacle cell.
# Because the literal mixes numbers with the string 'X', NumPy stores every
# entry as a string (0.2 becomes '0.2', 1 becomes '1'). The rest of this
# file depends on that when it compares cells against 'X' and '1'.
grid = np.array([
    [0.2, 0.3, 0.8, 0.1, 0],
    [0.5, 'X', 0.7, 0.4, 0],
    [0.9, 0.1, 0.6, 'X', 0.2],
    [0.4, 'X', 0.3, 0.5, 0.8],
    [0.7, 0.6, 0.1, 0.9, 1]  # Setting the goal state to 1
])

# Main loop state
running = True
agent_pos = (0, 0)  # Initial agent position as (row, column)
training_step_frequency = 10  # The agent acts on the frame after every 10th frame
training_active = True  # Whether the agent acts on the current frame
frame_count = 0

# Additional variables for saving the model
policy = {}  # Maps (row, column) -> greedy action derived from the value table V
model_saved = False  # Set to True once the policy has been written to disk

epsilon = 0.5  # Probability of taking a random valid action instead of the greedy one
decay_rate = 0.95  # Factor applied to epsilon after each agent step (itself shrinks over time)
min_epsilon = 0.01  # Floor that keeps a little exploration forever
frames_per_second = 30  # Frame cap; without it the loop spins as fast as the CPU allows

# Rendered once here; not drawn by the code in this section.
status_text = font.render("Training Status:", True, BLACK)

# The grid never changes while the program runs, so the value table and the
# greedy policy are computed once instead of on every agent step.
V = value_iteration(grid.copy(), start=(0, 0), goal=(n - 1, m - 1))

for i in range(n):
    for j in range(m):
        if grid[i, j] == 'X':
            continue
        valid_actions = get_valid_actions(grid, (i, j))
        # Prefer actions that actually leave the cell; moves against the
        # grid edge are reported as valid but keep the agent in place.
        moving_actions = [a for a in valid_actions
                          if get_next_state((i, j), a) != (i, j)]
        candidates = moving_actions or valid_actions
        if not candidates:
            continue  # Cell is fully enclosed; nothing to store
        # Greedy choice: the action whose successor cell has the highest value.
        policy[(i, j)] = max(candidates,
                             key=lambda a: V[get_next_state((i, j), a)])

clock = pygame.time.Clock()

try:
    while running:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False

        if training_active:
            # Epsilon-greedy action selection: explore with probability
            # epsilon, otherwise follow the stored greedy policy. The stored
            # policy itself stays free of random actions.
            valid_actions = get_valid_actions(grid, agent_pos)
            if valid_actions and random.random() < epsilon:
                current_action = random.choice(valid_actions)
            else:
                current_action = policy.get(agent_pos)

            # Decay epsilon but never let it drop below min_epsilon. The
            # per-step factor is decay_rate, limited to at least 0.1.
            epsilon = max(min_epsilon, epsilon * max(0.1, decay_rate))
            decay_rate *= 0.99

            # Update agent position based on the action
            if current_action is not None:
                new_pos = get_next_state(agent_pos, current_action)
                if new_pos != agent_pos:  # Only log real moves
                    agent_pos = new_pos
                    reward = grid[agent_pos[0], agent_pos[1]]  # Cell content of the new position
                    log.append(f"Action: {current_action}, Reward: {reward}, Current State: {agent_pos}")

        # Render the environment
        draw_gridworld(grid, V=V)
        draw_agent(agent_pos)
        pygame.display.update()

        # The agent acts on the next frame when the current frame index is a
        # multiple of training_step_frequency.
        training_active = frame_count % training_step_frequency == 0
        frame_count += 1

        clock.tick(frames_per_second)
finally:
    # Always release the pygame window, even if the loop raised.
    pygame.quit()

# Save the learned (greedy) policy
with open("policy.pkl", "wb") as f:
    pickle.dump(policy, f)
model_saved = True
