In [1]:
import numpy as np

In [26]:
# env with rewards
env = np.array([
    [-1, -5, -3],
    [0, 1, 2],
    [-1, -10, 10]
])

actions = [1, 2, 3, 4]
n_actions = len(actions)
index_to_action = {
    1: 'LEFT',
    2: 'RIGHT',
    3: 'UP',
    4: 'DOWN'
}

# Q-Action value array with shape (states, actions)
QA = np.random.uniform(0, 0.1, size=(env.size, n_actions))

QA

array([[0.09173322, 0.04463034, 0.01121587, 0.05069174],
       [0.03571657, 0.00170079, 0.04985916, 0.0545668 ],
       [0.0498742 , 0.07381411, 0.06152406, 0.06771359],
       [0.03912186, 0.03924504, 0.03813193, 0.08617925],
       [0.09878852, 0.03597733, 0.04414279, 0.01804502],
       [0.06426301, 0.0931838 , 0.0876583 , 0.08320833],
       [0.0814824 , 0.00243921, 0.05353578, 0.01827412],
       [0.07546633, 0.04120652, 0.03488119, 0.05430935],
       [0.00102101, 0.07113049, 0.05549247, 0.00860096]])

In [27]:
# left, right, up, down
def get_available_actions_by_state(state_index):
    """Returns binary mask for available actions: [left, right, up, down]"""
    actions_map = {
        0: [0, 1, 0, 1],  # top-left: right, down
        1: [1, 1, 0, 1],  # top-center: left, right, down
        2: [1, 0, 0, 1],  # top-right: left, down
        3: [0, 1, 1, 1],  # middle-left: right, up, down
        4: [1, 1, 1, 1],  # middle-center: all directions
        5: [1, 0, 1, 1],  # middle-right: left, up, down
        6: [0, 1, 1, 0],  # bottom-left: right, up
        7: [1, 1, 1, 0],  # bottom-center: left, right, up
        8: [1, 0, 1, 0],  # bottom-right: left, up
    }
    return np.array(actions_map[state_index])

def state_matrix_to_index(state):
    return state[0] * 3 + state[1]

def index_to_state_matrix(index):
    return [index // 3, index % 3]

def traverse_state_matrix(state, action):
    """Move in the environment based on action"""
    new_state = state.copy()
    if action == 1:    # left
        new_state[1] -= 1
    elif action == 2:  # right
        new_state[1] += 1
    elif action == 3:  # up
        new_state[0] -= 1
    elif action == 4:  # down
        new_state[0] += 1
    return new_state

def epsilon_greedy_action(state_index, epsilon=0.1):
    """Select action using epsilon-greedy strategy"""
    avail_actions = get_available_actions_by_state(state_index)
    valid_actions = [a for i, a in enumerate(actions) if avail_actions[i]]

    if np.random.random() < epsilon:
        # Explore: random action
        return np.random.choice(valid_actions)
    else:
        # Exploit: best action among valid ones
        q_values = QA[state_index, :].copy()
        for i, a in enumerate(actions):
            if not avail_actions[i]:
                q_values[i] = -np.inf  # Mask invalid actions
        best_action_idx = np.argmax(q_values)
        return actions[best_action_idx]

In [28]:
# Zero out Q-values for unavailable actions
for s in range(env.size):
    avail_actions = get_available_actions_by_state(s)
    QA[s, :] = QA[s, :] * avail_actions

QA

array([[0.        , 0.04463034, 0.        , 0.05069174],
       [0.03571657, 0.00170079, 0.        , 0.0545668 ],
       [0.0498742 , 0.        , 0.        , 0.06771359],
       [0.        , 0.03924504, 0.03813193, 0.08617925],
       [0.09878852, 0.03597733, 0.04414279, 0.01804502],
       [0.06426301, 0.        , 0.0876583 , 0.08320833],
       [0.        , 0.00243921, 0.05353578, 0.        ],
       [0.07546633, 0.04120652, 0.03488119, 0.        ],
       [0.00102101, 0.        , 0.05549247, 0.        ]])

In [29]:
# Training parameters
n_steps = 100
n_experiments = 10000
gamma = 0.95  # Discount factor (was y=0.1, too low!)
lr = 0.1     # Learning rate
epsilon = 0.2  # Exploration rate

success_count = 0

for i in range(n_experiments):
    state = [0, 0]  # Start at top-left
    reached_reward = False

    for step in range(n_steps):
        current_state_index = state_matrix_to_index(state)

        # Select action using epsilon-greedy
        chosen_action = epsilon_greedy_action(current_state_index, epsilon)

        # Take action and observe new state and reward
        new_state = traverse_state_matrix(state, chosen_action)
        new_state_index = state_matrix_to_index(new_state)
        reward = env[tuple(new_state)]

        # Q-learning update (CORRECTED)
        Q_target = reward + gamma * np.max(QA[new_state_index, :])
        action_idx = actions.index(chosen_action)
        TD_error = Q_target - QA[current_state_index, action_idx]
        QA[current_state_index, action_idx] += lr * TD_error

        # Move to new state
        state = new_state

        # Check if reached goal
        if reward == 10:
            reached_reward = True
            success_count += 1
            break

    # Print progress
    if (i + 1) % 100 == 0:
        success_rate = success_count / (i + 1) * 100
        print(f"Episode {i+1}: Success rate = {success_rate:.1f}%")

Episode 100: Success rate = 99.0%
Episode 200: Success rate = 99.5%
Episode 300: Success rate = 99.7%
Episode 400: Success rate = 99.8%
Episode 500: Success rate = 99.6%
Episode 600: Success rate = 98.8%
Episode 700: Success rate = 98.4%
Episode 800: Success rate = 97.9%
Episode 900: Success rate = 97.6%
Episode 1000: Success rate = 96.6%
Episode 1100: Success rate = 96.6%
Episode 1200: Success rate = 96.1%
Episode 1300: Success rate = 96.0%
Episode 1400: Success rate = 96.1%
Episode 1500: Success rate = 96.1%
Episode 1600: Success rate = 95.9%
Episode 1700: Success rate = 95.8%
Episode 1800: Success rate = 95.8%
Episode 1900: Success rate = 95.9%
Episode 2000: Success rate = 95.9%
Episode 2100: Success rate = 95.8%
Episode 2200: Success rate = 95.8%
Episode 2300: Success rate = 95.8%
Episode 2400: Success rate = 95.7%
Episode 2500: Success rate = 95.6%
Episode 2600: Success rate = 95.4%
Episode 2700: Success rate = 95.6%
Episode 2800: Success rate = 95.5%
Episode 2900: Success rate = 

In [30]:
QA

array([[0.00000000e+00, 2.32564103e+01, 0.00000000e+00, 2.82564103e+01],
       [2.58435897e+01, 2.57435897e+01, 0.00000000e+00, 2.97435897e+01],
       [2.32564103e+01, 0.00000000e+00, 0.00000000e+00, 3.02564103e+01],
       [0.00000000e+00, 2.97435897e+01, 2.58435897e+01, 2.58435897e+01],
       [2.82564103e+01, 3.02564103e+01, 2.32564103e+01, 1.82564103e+01],
       [2.97435897e+01, 0.00000000e+00, 2.57435897e+01, 1.00527178e+01],
       [0.00000000e+00, 1.82564096e+01, 2.82564103e+01, 0.00000000e+00],
       [2.58435897e+01, 1.00527178e+01, 2.97435897e+01, 0.00000000e+00],
       [1.02101399e-03, 0.00000000e+00, 5.54924660e-02, 0.00000000e+00]])

In [31]:
for r in range(QA.shape[0]):
  print(f"state {r} best action {index_to_action[QA[r, :].argmax() + 1]}")

state 0 best action DOWN
state 1 best action DOWN
state 2 best action DOWN
state 3 best action RIGHT
state 4 best action RIGHT
state 5 best action LEFT
state 6 best action UP
state 7 best action UP
state 8 best action UP
