It's a 1D gridworld:

Positions: 0 1 2 3 4
Start at state 0
Goal at state 4
Actions: 0 = left, 1 = right
Reward: +10 on reaching goal, -1 otherwise
Episode ends when you reach state 4 or exceed max steps.

In [1]:
import numpy as np
import random

# Simple 1D environment: states = [0, 1, 2, 3, 4], goal = 4
# N_STATES counts the cells; step() treats the last one (N_STATES - 1) as the goal.
N_STATES = 5
ACTIONS = [0, 1]  # 0 = left, 1 = right; step() treats any non-zero action as "right"

def step(state, action):
    """
    Advance the 1D gridworld by one move.

    Action 0 moves one cell to the left (never below cell 0); any other
    action moves one cell to the right (never past the last cell).

    Returns a tuple (next_state, reward, done):
      - reaching the last cell yields reward 10 and done=True;
      - every other move yields reward -1 and done=False, so shorter
        paths to the goal accumulate less penalty.
    """
    goal = N_STATES - 1

    # Clamp only on the side the move is heading towards.
    next_state = max(0, state - 1) if action == 0 else min(goal, state + 1)

    if next_state != goal:
        return next_state, -1, False  # ordinary move: small step penalty
    return next_state, 10, True       # arrived at the goal: episode over


# Q-learning parameters
alpha = 0.1      # learning rate: fraction of each TD error added to the Q-value
gamma = 0.9      # discount factor applied to the next state's best Q-value
epsilon = 0.2    # exploration rate: probability of picking a random action (epsilon-greedy)

n_episodes = 500  # number of training episodes run by the training loop

# Q-table: rows = states, cols = actions; every entry starts at zero
Q = np.zeros((N_STATES, len(ACTIONS)))

def choose_action(state, epsilon):
    """
    Pick an action for `state` with an epsilon-greedy rule.

    With probability `epsilon` a uniformly random entry of ACTIONS is
    returned (exploration); otherwise the index of the largest Q-value in
    the state's row of the global Q-table is returned (exploitation).
    Ties in the Q-values resolve to the lowest action index, as np.argmax
    returns the first maximum.
    """
    exploring = random.random() < epsilon
    if exploring:
        return random.choice(ACTIONS)

    q_row = Q[state, :]
    return np.argmax(q_row)

# Training loop
# Cap on steps per episode: the environment description ends an episode at the
# goal OR after too many steps, so termination never depends on chance alone.
MAX_STEPS = 100

for episode in range(n_episodes):
    state = 0  # start every episode from state 0
    done = False
    n_steps = 0

    while not done and n_steps < MAX_STEPS:
        action = choose_action(state, epsilon)

        next_state, reward, done = step(state, action)
        n_steps += 1

        # Q-learning update: move Q[state, action] toward the one-step TD target.
        # A terminal transition has no successor value, so bootstrap only while
        # the episode continues instead of relying on the goal row staying zero.
        best_next_value = 0.0 if done else np.max(Q[next_state, :])
        td_target = reward + gamma * best_next_value
        td_error = td_target - Q[state, action]
        Q[state, action] += alpha * td_error

        state = next_state

# Report what was learned: the raw Q-table, then the greedy action per state.
print("Learned Q-table:")
print(Q)

# Greedy policy = column index of the largest Q-value in each row.
# A row of equal values (e.g. one that was never updated) reports action 0.
policy = Q.argmax(axis=1)

print("\nGreedy policy (0=left, 1=right) per state:")
for s in range(N_STATES):
    greedy_action = policy[s]
    print(f"State {s}: action {greedy_action}")


Learned Q-table:
[[ 3.12084494  4.58      ]
 [ 3.12106171  6.2       ]
 [ 4.57825262  8.        ]
 [ 6.18069978 10.        ]
 [ 0.          0.        ]]

Greedy policy (0=left, 1=right) per state:
State 0: action 1
State 1: action 1
State 2: action 1
State 3: action 1
State 4: action 0
