In [46]:
import time
import random
import numpy as np
from typing import Dict, List, Tuple

In [47]:
class GridWorldMDP:
    """Deterministic square grid world with a random start, a random goal and obstacles.

    States are ``(x, y)`` integer tuples with both coordinates in ``[0, size - 1]``.
    'up'/'down' change the first coordinate, 'left'/'right' change the second.
    Moves that would leave the grid are clamped to the border, and moves into an
    obstacle leave the agent where it is.

    Attributes:
        size: Side length of the grid.
        states: Every ``(x, y)`` cell of the grid, obstacles included.
        actions: The four action names accepted by ``get_next_state``.
        start_state: Randomly chosen cell that is not an obstacle.
        goal_state: Randomly chosen cell, different from ``start_state`` and not an obstacle.
        obstacles: List of blocked cells (caller-supplied ones first, then random fill).
    """

    def __init__(self, size: int = 5, obstacles: List[Tuple[int, int]] = None):
        """Build the grid, then place start, goal and obstacles.

        Args:
            size: Side length of the grid; must leave room for start, goal and obstacles.
            obstacles: Optional cells to block. The list is copied, never mutated.
                Random obstacles are added until there are at least ``size`` of them.

        Raises:
            ValueError: If the grid has too few free cells to place a distinct start
                and goal plus the random obstacles still required (for example
                ``size < 2``, or caller-supplied obstacles that cover the grid).
        """
        self.size = size
        self.states = [(i, j) for i in range(size) for j in range(size)]
        self.actions = ['up', 'down', 'left', 'right']

        # Copy so that appending random obstacles below never alters the caller's list.
        self.obstacles = list(obstacles) if obstacles else []

        # Every placement loop below is rejection sampling; check up front that each
        # of them can actually succeed instead of spinning forever.
        free_cells = sum(1 for cell in self.states if cell not in self.obstacles)
        still_needed = max(0, size - len(self.obstacles))
        if free_cells < 2 + still_needed:
            raise ValueError(
                f"grid of size {size} with {len(self.obstacles)} given obstacle(s) has only "
                f"{free_cells} free cell(s); need {2 + still_needed} for start, goal "
                f"and random obstacles"
            )

        # Randomly place the start, keeping it off any caller-supplied obstacle.
        self.start_state = self._random_cell()
        while self.start_state in self.obstacles:
            self.start_state = self._random_cell()

        # Randomly place the goal: distinct from the start and not on an obstacle.
        self.goal_state = self._random_cell()
        while self.goal_state == self.start_state or self.goal_state in self.obstacles:
            self.goal_state = self._random_cell()

        # Top up with random obstacles until there are `size` of them.
        while len(self.obstacles) < size:
            obs = self._random_cell()
            if obs not in self.obstacles and obs != self.start_state and obs != self.goal_state:
                self.obstacles.append(obs)

    def _random_cell(self) -> Tuple[int, int]:
        """Draw a uniformly random cell of the grid (first coordinate is drawn first)."""
        return (random.randint(0, self.size - 1), random.randint(0, self.size - 1))

    def get_next_state(self, state: Tuple[int, int], action: str) -> Tuple[int, int]:
        """Determine next state given current state and action.

        Args:
            state: Current ``(x, y)`` cell.
            action: One of 'up', 'down', 'left', 'right'. Any other value is
                treated as 'right' (the final ``else`` branch).

        Returns:
            The neighbouring cell in the chosen direction, clamped to the grid
            border, or ``state`` itself when that neighbour is an obstacle.
        """
        x, y = state
        if action == 'up':
            new_state = (max(0, x-1), y)
        elif action == 'down':
            new_state = (min(self.size-1, x+1), y)
        elif action == 'left':
            new_state = (x, max(0, y-1))
        else:  # right
            new_state = (x, min(self.size-1, y+1))

        # Obstacles block movement: the agent stays where it was.
        if new_state in self.obstacles:
            return state
        return new_state

    def get_reward(self, state: Tuple[int, int]) -> float:
        """Get reward for being in a state.

        Returns:
            100 for the goal, -50 for an obstacle cell, and -1 for any other cell
            (a small per-step penalty).
        """
        if state == self.goal_state:
            return 100
        elif state in self.obstacles:
            return -50
        return -1  # Small negative reward for each step

In [48]:
def value_iteration(mdp: GridWorldMDP, gamma: float = 0.9, epsilon: float = 1e-6,
                    max_iterations: int = None) -> Dict[Tuple[int, int], float]:
    """Perform value iteration to find optimal values for each state.

    Each sweep recomputes every non-goal state's value from the previous sweep's
    values as ``max over actions of reward(next_state) + gamma * V[next_state]``.
    The goal state is never updated and keeps its initial value of 0.

    Args:
        mdp: Environment providing ``states``, ``actions``, ``goal_state``,
            ``get_next_state`` and ``get_reward``.
        gamma: Discount factor in ``[0, 1]``. With ``gamma == 1`` the values of
            states that cannot reach the goal decrease forever, so pass
            ``max_iterations`` in that case.
        epsilon: Stop once the largest change in a sweep is below this (must be > 0).
        max_iterations: Optional cap on the number of sweeps; ``None`` (default)
            means run until convergence.

    Returns:
        Mapping from each state to its estimated optimal value.

    Raises:
        ValueError: If ``gamma`` is outside ``[0, 1]`` or ``epsilon`` is not positive;
            either would keep the loop from ever terminating.
    """
    if not 0 <= gamma <= 1:
        raise ValueError(f"gamma must be in [0, 1], got {gamma!r}")
    if epsilon <= 0:
        # `delta < epsilon` could never hold, so the loop below would never stop.
        raise ValueError(f"epsilon must be positive, got {epsilon!r}")

    V = {state: 0.0 for state in mdp.states}
    sweeps = 0
    while max_iterations is None or sweeps < max_iterations:
        delta = 0.0
        # Synchronous update: read from V, write into a copy.
        V_new = V.copy()

        for state in mdp.states:
            if state == mdp.goal_state:
                continue

            # Find maximum value over all actions
            max_value = float('-inf')
            for action in mdp.actions:
                next_state = mdp.get_next_state(state, action)
                value = mdp.get_reward(next_state) + gamma * V[next_state]
                max_value = max(max_value, value)

            V_new[state] = max_value
            delta = max(delta, abs(V_new[state] - V[state]))

        V = V_new
        sweeps += 1
        if delta < epsilon:
            break

    return V

In [49]:
def get_optimal_policy(mdp: GridWorldMDP, V: Dict[Tuple[int, int], float], gamma: float = 0.9) -> Dict[Tuple[int, int], str]:
    """Derive the greedy policy implied by a state-value table.

    For every non-goal state, each action is scored as
    ``reward(next_state) + gamma * V[next_state]`` and the first action with the
    strictly highest score is selected. The goal state maps to ``None``.
    """
    greedy = {}

    for cell in mdp.states:
        chosen = None

        if cell != mdp.goal_state:
            top_score = float('-inf')
            for candidate in mdp.actions:
                landing = mdp.get_next_state(cell, candidate)
                score = mdp.get_reward(landing) + gamma * V[landing]
                # Strict comparison: on ties the earlier action in mdp.actions wins.
                if score > top_score:
                    chosen, top_score = candidate, score

        greedy[cell] = chosen

    return greedy

In [50]:
def simulate_episode(mdp: GridWorldMDP, policy: Dict[Tuple[int, int], str], max_steps: int = 100,
                     random_action_prob: float = 0.2) -> Tuple[List[Tuple[int, int]], bool]:
    """Simulate one episode following the given policy with some randomness.

    Args:
        mdp: Environment providing ``start_state``, ``goal_state``, ``actions``
            and ``get_next_state``.
        policy: Mapping from state to the action to take there.
        max_steps: Maximum number of moves before the episode is cut off.
        random_action_prob: Probability, per move, of ignoring the policy and
            taking a uniformly random action instead.

    Returns:
        ``(path, success)`` where ``path`` lists every visited state starting with
        the start state, and ``success`` tells whether the final state is the goal.
    """
    current_state = mdp.start_state
    path = [current_state]

    for _ in range(max_steps):
        if current_state == mdp.goal_state:
            break

        # Add some randomness to make it more dynamic
        if random.random() < random_action_prob:
            action = random.choice(mdp.actions)
        else:
            action = policy[current_state]

        current_state = mdp.get_next_state(current_state, action)
        path.append(current_state)

    # Judge success on the final state so that reaching the goal on the very
    # last allowed move still counts.
    return path, current_state == mdp.goal_state

In [51]:
def visualize_grid(mdp: GridWorldMDP, path: List[Tuple[int, int]] = None):
    """Print the grid world as text, optionally with the path taken.

    Legend: ``█`` obstacle, ``S`` start, ``G`` goal, ``•`` visited cell, blank
    otherwise. Cells in a row are separated by a single space.

    Args:
        mdp: Environment providing ``size``, ``obstacles``, ``start_state`` and
            ``goal_state``.
        path: Optional sequence of visited states. Every visited cell other than
            the start and goal is marked, including the final cell of an episode
            that did not reach the goal.
    """
    grid = [[' ' for _ in range(mdp.size)] for _ in range(mdp.size)]

    # Place obstacles
    for obs in mdp.obstacles:
        grid[obs[0]][obs[1]] = '█'

    # Place start and goal
    grid[mdp.start_state[0]][mdp.start_state[1]] = 'S'
    grid[mdp.goal_state[0]][mdp.goal_state[1]] = 'G'

    # Place path. Start and goal are skipped by value rather than by position,
    # so the last cell of a failed episode is still drawn.
    if path:
        for state in path:
            if state != mdp.goal_state and state != mdp.start_state:
                grid[state[0]][state[1]] = '•'

    # Print the grid
    print('\n'.join([' '.join(row) for row in grid]))

In [52]:
def run_simulation(size: int = 4):
    """Run a complete simulation and print the grid, the path and a short summary.

    Builds a random grid world, solves it with value iteration, extracts the
    greedy policy, simulates one episode and prints the result.

    Args:
        size: Side length of the grid world to generate.
    """
    # Create MDP
    mdp = GridWorldMDP(size=size)

    print("Initial Grid:")
    visualize_grid(mdp)

    # Compute optimal values and policy
    print("\nComputing optimal policy...")
    values = value_iteration(mdp)
    policy = get_optimal_policy(mdp, values)

    # Simulate episode
    print("\nSimulating episode...")
    path, success = simulate_episode(mdp, policy)

    print("\nFinal Path:")
    visualize_grid(mdp, path)

    if success:
        print("\nSuccess! Agent reached the goal! 🎉")
    else:
        print("\nBetter luck next time! Agent didn't reach the goal in time. 😞")

    # Print some statistics. `path` includes the start state, so the number of
    # moves taken is one less than the number of recorded states.
    print(f"\nPath length: {len(path) - 1} steps")
    print(f"Start state: {mdp.start_state}")
    print(f"Goal state: {mdp.goal_state}")

In [53]:
# Run multiple simulations
def run_multiple_simulations(num_simulations: int = 5, size: int = 5, delay: float = 1.0):
    """Run multiple simulations and track success rate.

    Each simulation builds a fresh random grid world, solves it with value
    iteration, simulates one episode, prints the grid with the path and reports
    whether the goal was reached. The overall success rate is printed at the end.

    Args:
        num_simulations: How many independent simulations to run.
        size: Side length of each generated grid world.
        delay: Seconds to pause after each simulation; 0 disables the pause.
    """
    if num_simulations <= 0:
        # Nothing to run; avoid dividing by zero in the success-rate summary.
        print("\nNo simulations requested; nothing to report.")
        return

    successes = 0

    for i in range(num_simulations):
        print(f"\n=== Simulation {i+1} ===")
        mdp = GridWorldMDP(size=size)
        values = value_iteration(mdp)
        policy = get_optimal_policy(mdp, values)
        path, success = simulate_episode(mdp, policy)

        visualize_grid(mdp, path)
        if success:
            successes += 1
            print("\nSuccess! Agent reached the goal! 🎉")
        else:
            print("\nBetter luck next time! Agent didn't reach the goal in time. 😞")

        if delay > 0:
            time.sleep(delay)  # Add delay between simulations

    print(f"\nOverall Success Rate: {successes/num_simulations*100:.1f}%")

In [54]:
if __name__ == "__main__":
    # Demo 1: one fully narrated episode on a single random grid.
    print("=== Single Simulation ===")
    run_simulation()

    # Demo 2: a batch of episodes, each on a fresh grid, with a success-rate summary.
    print("\n=== Multiple Simulations ===")
    run_multiple_simulations(num_simulations=5)

=== Single Simulation ===
Initial Grid:
█      
      █
█ G   █
  S    

Computing optimal policy...

Simulating episode...

Final Path:
█      
      █
█ G   █
  S    

Success! Agent reached the goal! 🎉

Path length: 2 steps
Start state: (3, 1)
Goal state: (2, 1)

=== Multiple Simulations ===

=== Simulation 1 ===
    • G █
  █ •   █
█ • •    
  • •    
  S █    

Success! Agent reached the goal! 🎉

=== Simulation 2 ===
G • • █  
    •   █
    • █ S
  █ • • •
    █    

Success! Agent reached the goal! 🎉

=== Simulation 3 ===
         
      █  
█ █ █    
      G S
    █   •

Success! Agent reached the goal! 🎉

=== Simulation 4 ===
      █  
█   █    
  █   G  
      S  
█        

Success! Agent reached the goal! 🎉

=== Simulation 5 ===
         
█   G █ █
    S    
█        
    █    

Success! Agent reached the goal! 🎉

Overall Success Rate: 100.0%
