In [1]:
import numpy as np

In [2]:
# env with rewards
env = np.array([
    [-1, -5, -3],
    [0, 1, 2],
    [-1, -10, 10]
])

actions = [1, 2, 3, 4]
n_actions = len(actions)
index_to_action = {
    1: 'LEFT',
    2: 'RIGHT',
    3: 'UP',
    4: 'DOWN'
}

# Q-Action value array with shape (states, actions)
QA = np.random.uniform(0, 0.1, size=(env.size, n_actions))

QA

array([[0.08749511, 0.03338182, 0.04668114, 0.07879729],
       [0.05759238, 0.09007621, 0.07509596, 0.08498091],
       [0.04076369, 0.0559392 , 0.01037262, 0.09845774],
       [0.08838206, 0.09717103, 0.08402617, 0.02310035],
       [0.00819595, 0.07140293, 0.00141551, 0.02612242],
       [0.0996489 , 0.00659592, 0.04484796, 0.09811831],
       [0.00352745, 0.05508589, 0.005409  , 0.06128077],
       [0.04232811, 0.03721199, 0.08735546, 0.07339779],
       [0.04817543, 0.06010971, 0.08577752, 0.05875158]])

In [3]:
# left, right, up, down
def get_available_actions_by_state(state_index):
    """Returns binary mask for available actions: [left, right, up, down]"""
    actions_map = {
        0: [0, 1, 0, 1],  # top-left: right, down
        1: [1, 1, 0, 1],  # top-center: left, right, down
        2: [1, 0, 0, 1],  # top-right: left, down
        3: [0, 1, 1, 1],  # middle-left: right, up, down
        4: [1, 1, 1, 1],  # middle-center: all directions
        5: [1, 0, 1, 1],  # middle-right: left, up, down
        6: [0, 1, 1, 0],  # bottom-left: right, up
        7: [1, 1, 1, 0],  # bottom-center: left, right, up
        8: [1, 0, 1, 0],  # bottom-right: left, up
    }
    return np.array(actions_map[state_index])

def state_matrix_to_index(state):
    return state[0] * 3 + state[1]

def index_to_state_matrix(index):
    return [index // 3, index % 3]

def traverse_state_matrix(state, action):
    """Move in the environment based on action"""
    new_state = state.copy()
    if action == 1:    # left
        new_state[1] -= 1
    elif action == 2:  # right
        new_state[1] += 1
    elif action == 3:  # up
        new_state[0] -= 1
    elif action == 4:  # down
        new_state[0] += 1
    return new_state

def epsilon_greedy_action(state_index, epsilon=0.1):
    """Select action using epsilon-greedy strategy"""
    avail_actions = get_available_actions_by_state(state_index)
    valid_actions = [a for i, a in enumerate(actions) if avail_actions[i]]

    if np.random.random() < epsilon:
        # Explore: random action
        return np.random.choice(valid_actions)
    else:
        # Exploit: best action among valid ones
        q_values = QA[state_index, :].copy()
        for i, a in enumerate(actions):
            if not avail_actions[i]:
                q_values[i] = -np.inf  # Mask invalid actions
        best_action_idx = np.argmax(q_values)
        return actions[best_action_idx]

In [4]:
# Zero out Q-values for unavailable actions
for s in range(env.size):
    avail_actions = get_available_actions_by_state(s)
    QA[s, :] = QA[s, :] * avail_actions

QA

array([[0.        , 0.03338182, 0.        , 0.07879729],
       [0.05759238, 0.09007621, 0.        , 0.08498091],
       [0.04076369, 0.        , 0.        , 0.09845774],
       [0.        , 0.09717103, 0.08402617, 0.02310035],
       [0.00819595, 0.07140293, 0.00141551, 0.02612242],
       [0.0996489 , 0.        , 0.04484796, 0.09811831],
       [0.        , 0.05508589, 0.005409  , 0.        ],
       [0.04232811, 0.03721199, 0.08735546, 0.        ],
       [0.04817543, 0.        , 0.08577752, 0.        ]])

In [9]:
# Training parameters
n_steps = 100
n_experiments = 1000
gamma = 0.9  # Discount factor (was y=0.1, too low!)
lr = 0.1     # Learning rate
epsilon = 0.1  # Exploration rate

success_count = 0

for i in range(n_experiments):
    state = [0, 0]  # Start at top-left
    reached_reward = False

    for step in range(n_steps):
        current_state_index = state_matrix_to_index(state)

        # Select action using epsilon-greedy
        chosen_action = epsilon_greedy_action(current_state_index, epsilon)

        # Take action and observe new state and reward
        new_state = traverse_state_matrix(state, chosen_action)
        new_state_index = state_matrix_to_index(new_state)
        reward = env[tuple(new_state)]

        # Q-learning update (CORRECTED)
        Q_target = reward + gamma * np.max(QA[new_state_index, :])
        action_idx = actions.index(chosen_action)
        TD_error = Q_target - QA[current_state_index, action_idx]
        QA[current_state_index, action_idx] += lr * TD_error

        # Move to new state
        state = new_state

        # Check if reached goal
        if reward == 10:
            reached_reward = True
            success_count += 1
            break

    # Print progress
    if (i + 1) % 100 == 0:
        success_rate = success_count / (i + 1) * 100
        print(f"Episode {i+1}: Success rate = {success_rate:.1f}%")

Episode 100: Success rate = 79.0%
Episode 200: Success rate = 81.5%
Episode 300: Success rate = 80.0%
Episode 400: Success rate = 79.0%
Episode 500: Success rate = 79.0%
Episode 600: Success rate = 78.8%
Episode 700: Success rate = 78.9%
Episode 800: Success rate = 78.1%
Episode 900: Success rate = 78.2%
Episode 1000: Success rate = 78.9%


In [10]:
for r in range(QA.shape[0]):
  print(f"state {r} best action {index_to_action[QA[r, :].argmax() + 1]}")

state 0 best action DOWN
state 1 best action DOWN
state 2 best action DOWN
state 3 best action RIGHT
state 4 best action RIGHT
state 5 best action LEFT
state 6 best action UP
state 7 best action UP
state 8 best action UP
