# Grid World
Grid World: A 5x5 grid.
States: Each cell in the grid is a state the agent can be in.
Actions: The agent can move in four directions: left (0), right (1), up (2), and down (3).
Goal: The goal is at (4, 4), providing a positive reward.
Penalty: There's a penalty state at (3, 3) to avoid.
Transitions: Moving off the grid keeps the agent in its current state.
Reward: -1 for each step, +10 for reaching the goal, and -10 for entering the penalty state.

In [16]:
import numpy as np
import random

# Environment layout: a square grid whose cells are addressed as (row, col).
grid_size = 5
goal_state = (4, 4)
penalty_state = (3, 3)

# Moves expressed as (row delta, col delta); the list index is the action id.
actions = [
    (0, -1),  # 0: Left
    (0, 1),   # 1: Right
    (-1, 0),  # 2: Up
    (1, 0),   # 3: Down
]
n_actions = len(actions)

# Learning hyperparameters.
epsilon = 0.1     # Probability of picking a random action instead of the greedy one
alpha = 0.5       # Learning rate
gamma = 0.9       # Discount rate
n_episodes = 100  # Number of training episodes



def step(state, action):
    """Apply one move in the grid and return the outcome.

    Args:
        state: Current cell as a (row, col) pair.
        action: Index into the module-level ``actions`` list.

    Returns:
        A ``(next_state, reward)`` tuple. Each coordinate is clamped to
        ``[0, grid_size - 1]``, so a move that would leave the grid keeps
        that coordinate unchanged. The reward is 10 when the move lands on
        ``goal_state``, -10 when it lands on ``penalty_state``, and -1
        otherwise.
    """
    d_row, d_col = actions[action]
    last_index = grid_size - 1

    # Clamp each coordinate independently so the agent never leaves the grid.
    row = max(0, min(last_index, state[0] + d_row))
    col = max(0, min(last_index, state[1] + d_col))
    next_state = (row, col)

    # The goal check comes first, matching the reward priority of the task.
    if next_state == goal_state:
        return next_state, 10
    if next_state == penalty_state:
        return next_state, -10
    return next_state, -1


# On-Policy Temporal Difference Control (SARSA)
$Q(S, A) \leftarrow Q(S, A)+\alpha\left[R+\gamma Q\left(S^{\prime}, A^{\prime}\right)-Q(S, A)\right]$

In [14]:
# Action-value table: one estimate per (row, col, action), all starting at 0.
Q = np.zeros((grid_size, grid_size, n_actions))


def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action id is returned;
    otherwise the action with the highest current Q-value for this cell is
    returned (``np.argmax`` resolves ties in favour of the lowest id).
    """
    explore = random.random() < epsilon
    if explore:
        return random.randint(0, n_actions - 1)

    row, col = state[0], state[1]
    return np.argmax(Q[row, col, :])

# SARSA Algorithm
for episode in range(n_episodes):
    # Random start
    state = (random.randint(0, grid_size - 1), random.randint(0, grid_size - 1))

    # SARSA is on-policy: the action A' used in the update target must be the
    # action that is actually executed on the next step. So the first action
    # is chosen once here, and afterwards each step reuses the action that
    # was sampled for the previous update instead of sampling a fresh one.
    action = choose_action(state)

    while state != goal_state:
        next_state, reward = step(state, action)
        next_action = choose_action(next_state)

        # SARSA update: Q(S, A) += alpha * [R + gamma * Q(S', A') - Q(S, A)].
        # The goal state's Q-values are never updated (the loop exits there),
        # so they stay 0 and the target reduces to R on the final transition.
        Q[state[0], state[1], action] += alpha * (
            reward
            + gamma * Q[next_state[0], next_state[1], next_action]
            - Q[state[0], state[1], action]
        )

        # Carry both S' and A' forward so the next step executes A'.
        state = next_state
        action = next_action

# Derive policy from learned Q-table
# Each cell gets the id of its highest-valued action; the goal cell is
# skipped and therefore keeps the initial value 0.
policy = np.zeros((grid_size, grid_size), dtype=int)
for cell in np.ndindex(grid_size, grid_size):
    if cell != goal_state:
        policy[cell] = np.argmax(Q[cell])

# Translate action ids into direction names for printing, and blank out the
# goal cell since no action is taken from there.
action_names = {0: 'Left', 1: 'Right', 2: 'Up', 3: 'Down'}
policy_readable = np.vectorize(action_names.get)(policy)
policy_readable[goal_state] = ''

print("Derived Policy:")
print(policy_readable)

#TODO: Derive actual policy from value table.

Derived Policy:
[['Right' 'Down' 'Down' 'Right' 'Down']
 ['Right' 'Down' 'Down' 'Right' 'Down']
 ['Right' 'Down' 'Down' 'Right' 'Down']
 ['Right' 'Down' 'Down' 'Right' 'Down']
 ['Right' 'Right' 'Right' 'Right' '']]


# Off-Policy Temporal Difference Control (Q-learning)

$Q(S, A) \leftarrow Q(S, A)+\alpha\left[R+\gamma \max _a Q\left(S^{\prime}, a\right)-Q(S, A)\right]$

In [17]:
# Fresh action-value table for Q-learning: one estimate per (row, col, action).
Q = np.zeros((grid_size, grid_size, n_actions))


def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action id is returned;
    otherwise the action with the highest current Q-value for this cell is
    returned (``np.argmax`` resolves ties in favour of the lowest id).
    """
    explore = random.random() < epsilon
    if explore:
        return random.randint(0, n_actions - 1)

    row, col = state[0], state[1]
    return np.argmax(Q[row, col, :])

# Q-learning Algorithm
for episode in range(n_episodes):
    # Every episode begins in a uniformly random cell and runs to the goal.
    start_row = random.randint(0, grid_size - 1)
    start_col = random.randint(0, grid_size - 1)
    state = (start_row, start_col)

    while state != goal_state:
        action = choose_action(state)
        next_state, reward = step(state, action)

        # Off-policy target: bootstrap from the best action value available
        # in the next state, regardless of which action is taken there.
        best_next_value = np.max(Q[next_state[0], next_state[1], :])

        # Q-learning update:
        # Q(S, A) += alpha * [R + gamma * max_a Q(S', a) - Q(S, A)]
        sa_index = (state[0], state[1], action)
        Q[sa_index] += alpha * (reward + gamma * best_next_value - Q[sa_index])

        state = next_state

# Derive policy from learned Q-table
# Each cell gets the id of its highest-valued action; the goal cell is
# skipped and therefore keeps the initial value 0.
policy = np.zeros((grid_size, grid_size), dtype=int)
for cell in np.ndindex(grid_size, grid_size):
    if cell != goal_state:
        policy[cell] = np.argmax(Q[cell])

# Translate action ids into direction names for printing, and blank out the
# goal cell since no action is taken from there.
action_names = {0: 'Left', 1: 'Right', 2: 'Up', 3: 'Down'}
policy_readable = np.vectorize(action_names.get)(policy)
policy_readable[goal_state] = ''

print("Derived Policy:")
print(policy_readable)

Derived Policy:
[['Right' 'Down' 'Down' 'Down' 'Down']
 ['Right' 'Right' 'Right' 'Down' 'Left']
 ['Down' 'Down' 'Right' 'Right' 'Down']
 ['Down' 'Down' 'Down' 'Down' 'Down']
 ['Right' 'Right' 'Right' 'Right' '']]
