In [2]:
import numpy as np

# Two-state MDP (A, B) with two actions per state: 0 = Move, 1 = Stay.
num_states, num_actions = 2, 2
gamma = 0.8      # discount factor
alpha = 0.5      # learning rate
num_steps = 200  # number of Q-learning updates to perform

# Action-value table: one row per state, one column per action, all zeros.
Q = np.zeros((num_states, num_actions))

# The agent starts in state A.
current_state = 0

# Q-learning under a purely greedy behaviour policy.
for step in range(num_steps):
    # Collect every action whose value equals the row maximum.
    q_row = Q[current_state]
    maximisers = np.flatnonzero(q_row == q_row.max())

    # A unique maximiser is taken as-is; any tie is resolved in favour of Move.
    current_action = maximisers[0] if maximisers.size == 1 else 0

    # Environment dynamics: Move flips the state and pays 0,
    # Stay keeps the state and pays 1.
    took_move = current_action == 0
    next_state = 1 - current_state if took_move else current_state
    reward = 0 if took_move else 1

    # Blend the old estimate with the bootstrapped one-step target.
    td_target = reward + gamma * np.max(Q[next_state])
    old_estimate = Q[current_state, current_action]
    Q[current_state, current_action] = (1 - alpha) * old_estimate + alpha * td_target

    # Advance to the successor state.
    current_state = next_state

# Report every learned action value as "Q(<state>, <action>): <value>".
action_names = ['Move', 'Stay']
print("Final Q-table:")
for state in range(num_states):
    state_label = chr(65 + state)  # 0 -> 'A', 1 -> 'B'
    for action in range(num_actions):
        print(f"Q({state_label}, {action_names[action]}): {Q[state, action]}")



Final Q-table:
Q(A, Move): 0.0
Q(A, Stay): 0.0
Q(B, Move): 0.0
Q(B, Stay): 0.0


In [8]:
import numpy as np

# Define MDP parameters
num_states = 2
num_actions = 2
gamma = 0.8     # Discount factor
alpha = 0.5     # Learning rate
epsilon = 0.5   # Epsilon for ε-greedy policy
num_steps = 200
seed = 0        # Fixed seed so the printed table is reproducible on re-run

# Action indices used by the environment dynamics below
MOVE, STAY = 0, 1

# Initial state
current_state = 0  # State A


def run_q_learning(num_steps, alpha, gamma, epsilon, seed=None, start_state=0):
    """Run tabular Q-learning on the two-state Move/Stay MDP.

    Dynamics: Move switches to the other state and yields reward 0; Stay keeps
    the current state and yields reward 1. The table has shape
    (num_states, num_actions), taken from the module-level parameters; the
    transition rule itself assumes exactly two states.

    Behaviour policy: with probability ``epsilon`` an action is drawn uniformly
    from all actions; otherwise the greedy action is taken, with ties broken
    uniformly at random among the maximising actions only.

    Parameters
    ----------
    num_steps : int
        Number of environment steps (and Q-updates) to perform.
    alpha : float
        Learning rate used to blend the old estimate with the one-step target.
    gamma : float
        Discount factor applied to the best next-state value.
    epsilon : float
        Exploration probability of the ε-greedy policy.
    seed : int or None, optional
        Seed for ``np.random.default_rng``. ``None`` gives a fresh,
        non-reproducible stream.
    start_state : int, optional
        Index of the state the agent starts in (0 = A).

    Returns
    -------
    numpy.ndarray
        The learned Q-table of shape (num_states, num_actions).
    """
    # A local generator keeps the run independent of global np.random state.
    rng = np.random.default_rng(seed)
    q_table = np.zeros((num_states, num_actions))
    state = start_state

    for _ in range(num_steps):
        if rng.random() < epsilon:
            # Explore: uniformly choose between move and stay
            action = int(rng.integers(num_actions))
        else:
            # Exploit: choose among the maximisers only, so a tie can never
            # select a non-greedy action.
            best_actions = np.flatnonzero(q_table[state] == q_table[state].max())
            action = int(rng.choice(best_actions))

        # Perform the selected action and observe the next state and reward
        if action == MOVE:
            next_state, reward = 1 - state, 0  # Switch state
        else:
            next_state, reward = state, 1      # Stay put

        # Update Q-value toward the one-step bootstrapped target
        td_target = reward + gamma * np.max(q_table[next_state])
        q_table[state, action] = (1 - alpha) * q_table[state, action] + alpha * td_target

        # Move to the next state
        state = next_state

    return q_table


# Run Q-learning for 200 steps with ε-greedy policy
Q = run_q_learning(num_steps, alpha, gamma, epsilon, seed=seed, start_state=current_state)

# Display the final action-value table in the format: state, action
print("Final Q-table:")
for state in range(num_states):
    for action in range(num_actions):
        print(f"Q({chr(65 + state)}, {['Move', 'Stay'][action]}): {Q[state, action]}")


Final Q-table:
Q(A, Move): 3.998529314668884
Q(A, Stay): 4.999016864747624
Q(B, Move): 3.997546494916402
Q(B, Stay): 4.998786252774844
