In [4]:
import numpy as np

# Define the environment
env = np.array([[-1, -1, -1, -1, 0, -1],
                [-1, -1, -1, 0, -1, 100],
                [-1, -1, -1, 0, -1, -1],
                [-1, 0, 0, -1, 0, -1],
                [0, -1, -1, 0, -1, 100],
                [-1, 0, -1, -1, 0, 100]])

# Define the Q-table
Q = np.zeros_like(env)

# Define the hyperparameters
alpha = 0.1
gamma = 0.9
epsilon = 0.1
num_episodes = 100

# Define the epsilon-greedy policy
def epsilon_greedy_policy(state, epsilon):
    if np.random.uniform() < epsilon:
        # Choose a random action
        action = np.random.randint(env.shape[0])
    else:
        # Choose the action with the highest Q-value
        action = np.argmax(Q[state, :])
    return action

# Run the Q-learning algorithm
for episode in range(num_episodes):
    # Reset the environment
    state = np.random.randint(env.shape[0])
    done = False
    
    while not done:
        # Choose an action using the epsilon-greedy policy
        action = epsilon_greedy_policy(state, epsilon)
        
        # Take the action and observe the next state and reward
        next_state = action
        reward = env[state, action]
        
        # Update the Q-value for the current state-action pair
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state, :]) - Q[state, action])
        
        # Update the state
        state = next_state
        
        # Check if the episode is done
        if state == 5:
            done = True
    
    # Print the learned Q-table after each episode
    print(f"Q-table after episode {episode+1}:")
    print(Q)


Q-table after episode 1:
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Q-table after episode 2:
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Q-table after episode 3:
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Q-table after episode 4:
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Q-table after episode 5:
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Q-table after episode 6:
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Q-table after episode 7:
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Q-table after episode 8:
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Q-table after episode 9:
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [