In [15]:
import numpy as np

In [35]:
# --- Environment ---------------------------------------------------------
# States are indexed 0 .. n_states - 1; the training loop stops an episode
# as soon as goal_state is reached.
n_states, n_actions = 16, 4  # 16 grid-world states; 4 actions (up, down, left, right)
goal_state = 15

# --- Q-learning hyperparameters -------------------------------------------
epochs = 1_000                 # number of training episodes
discount_factor = 0.95         # weight on the bootstrapped next-state value
exploration_probability = 0.2  # epsilon of the epsilon-greedy action choice
learning_rate0 = 0.8           # learning rate used in epoch 0
decay = 0.01                   # per-epoch decay: lr = learning_rate0 / (1 + epoch * decay)

In [36]:
# Action-value table: one row per state, one column per action, all zeros to start.
Q_table = np.zeros(shape=(n_states, n_actions))

In [37]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Each epoch is one episode: start in a uniformly random state and step until
# goal_state is reached (an episode that starts on goal_state does no updates).
#
# NOTE(review): the transition below ignores `action` -- every step moves to
# (current_state + 1) % n_states -- so all actions in a state share the same
# update target and the argmax over a row mostly reflects update noise.
# Confirm whether an action-dependent grid-world transition was intended.
# NOTE(review): no RNG seed is set in the visible cells, so results differ
# from run to run.
for epoch in range(epochs):
    # The learning rate depends only on the epoch, so compute it once per
    # episode rather than once per step.
    learning_rate = learning_rate0 / (1 + epoch * decay)

    current_state = np.random.randint(0, n_states)  # Start from a random state

    while current_state != goal_state:
        # Epsilon-greedy: explore with probability exploration_probability,
        # otherwise take the action with the highest current Q-value.
        if np.random.rand() < exploration_probability:
            action = np.random.randint(0, n_actions)  # Explore
        else:
            action = np.argmax(Q_table[current_state])  # Exploit

        # Simulate the environment: deterministic step to the next state index.
        next_state = (current_state + 1) % n_states

        # Reward is 1 only on the transition that lands on the goal state.
        reward = 1 if next_state == goal_state else 0

        # Q-learning (Bellman) update: move Q(s, a) towards the TD target
        # r + gamma * max_a' Q(s', a'). The goal row is never updated, so it
        # stays at zero and contributes nothing to the target.
        td_target = reward + discount_factor * np.max(Q_table[next_state])
        Q_table[current_state, action] += learning_rate * \
                                          (td_target - Q_table[current_state, action])

        current_state = next_state  # Move to the next state

In [38]:
# Show the learned action values (rows: states, columns: actions).
print("\nQ-Table after Q-Learning:", Q_table, sep="\n")


Q-Table after Q-Learning:
[[0.48716873 0.13572149 0.28606469 0.28084855]
 [0.33302451 0.31524609 0.26168524 0.51334189]
 [0.39878388 0.44755844 0.48421607 0.54036009]
 [0.41166687 0.44369741 0.53555674 0.56880009]
 [0.59873694 0.52737653 0.57581272 0.59462335]
 [0.63024941 0.58652047 0.53528394 0.61018997]
 [0.64461535 0.66004942 0.61872928 0.66342043]
 [0.6983373  0.69432407 0.6928497  0.69666566]
 [0.73402675 0.73130464 0.73509189 0.73268511]
 [0.77372941 0.77142092 0.76978571 0.77378094]
 [0.81450625 0.81430173 0.81286788 0.81447619]
 [0.857375   0.85702085 0.85719912 0.85561139]
 [0.9025     0.90249317 0.90249245 0.90210924]
 [0.94999747 0.94933171 0.95       0.94994173]
 [0.99999109 1.         0.99999679 0.99993717]
 [0.         0.         0.         0.        ]]


In [39]:
# Greedy policy: for each state, the index of the action with the largest
# Q-value (argmax returns the first index when several values tie).
optimal_policy = Q_table.argmax(axis=1)
print("\nOptimal Policy:", optimal_policy, sep="\n")


Optimal Policy:
[0 3 3 3 0 0 3 0 2 3 0 0 0 2 1 0]
