In [1]:
import numpy as np
from collections import deque
import random

In [7]:
# Environment definition
n_states = 16  # Number of states in the grid world
n_actions = 4  # Number of possible actions (up, down, left, right)
goal_state = 15  # Terminal state: an episode ends as soon as it is reached

# Reproducibility: this notebook draws from NumPy's global RNG (np.random.*)
# and from the stdlib `random` module (replay-buffer sampling), so seed both.
random_seed = 42
np.random.seed(random_seed)
random.seed(random_seed)

# Q-learning hyperparameters
learning_rate0 = 0.8  # Initial learning rate
decay = 0.01  # Learning rate decay (applied per epoch)
discount_factor = 0.95  # Weight of the bootstrapped next-state value
exploration_probability = 0.2  # Epsilon for epsilon-greedy action selection
epochs = 1000  # Number of training episodes

# Replay buffer
buffer_size = 100  # Maximum size of the replay buffer
batch_size = 32  # Number of transitions sampled for each training step

# Sanity checks: fail loudly here instead of training silently doing nothing.
assert 0 <= goal_state < n_states, "goal_state must be a valid state index"
assert 0 < batch_size <= buffer_size, (
    "batch_size must not exceed buffer_size, otherwise the buffer can never "
    "hold enough transitions to sample a batch"
)

In [8]:
# Experience-replay memory; being a bounded deque, it discards the oldest
# transition automatically once `buffer_size` entries are stored.
replay_buffer = deque([], maxlen=buffer_size)

# Action-value table: one row per state, one column per action, all zeros.
Q_table = np.zeros(shape=(n_states, n_actions))

In [9]:
# Q-learning with experience replay.
# Each epoch is one episode: start in a random state and step until the goal
# state is reached, replaying a random mini-batch after every step.
for epoch in range(epochs):
    # The decayed learning rate depends only on the epoch, so compute it once
    # per episode instead of once per replayed transition.
    learning_rate = learning_rate0 / (1 + epoch * decay)

    current_state = np.random.randint(0, n_states)  # Start from a random state

    while current_state != goal_state:
        # Choose action with epsilon-greedy strategy
        if np.random.rand() < exploration_probability:
            action = np.random.randint(0, n_actions)  # Explore
        else:
            action = np.argmax(Q_table[current_state])  # Exploit

        # Simulate the environment (move to the next state).
        # NOTE: this transition does not depend on `action`; the agent always
        # advances to the next state index, so every action in a state leads
        # to the same successor and reward.
        next_state = (current_state + 1) % n_states

        # Define a simple reward function (1 if the goal state is reached, 0 otherwise)
        reward = 1 if next_state == goal_state else 0

        # Store transition in the replay buffer
        replay_buffer.append((current_state, action, reward, next_state))

        # Train using a random batch from the replay buffer (only once it
        # holds at least `batch_size` transitions).
        if len(replay_buffer) >= batch_size:
            batch = random.sample(replay_buffer, batch_size)
            # Updates are applied one at a time on purpose: a later sample in
            # the batch may bootstrap from a value updated by an earlier one.
            for s, a, r, ns in batch:
                # No stored transition starts from the goal state (the while
                # loop exits there), so this loop never writes to
                # Q_table[goal_state].
                td_target = r + discount_factor * np.max(Q_table[ns])
                Q_table[s, a] += learning_rate * (td_target - Q_table[s, a])

        current_state = next_state  # Move to the next state

In [10]:
# Display the learned action-value table (rows: states, columns: actions)
print("\nQ-Table after Q-Learning:", Q_table, sep="\n")


Q-Table after Q-Learning:
[[0.48767498 0.48096461 0.48764259 0.48767498]
 [0.51334208 0.51334208 0.51334208 0.51334208]
 [0.54035856 0.54036009 0.54036009 0.54036009]
 [0.56880009 0.56880009 0.56880009 0.56880009]
 [0.59873694 0.59873694 0.59873694 0.59873694]
 [0.63024941 0.63024941 0.63024941 0.63024941]
 [0.66342043 0.66342043 0.66342043 0.66342043]
 [0.6983373  0.6983373  0.6983373  0.6983373 ]
 [0.73509189 0.73509189 0.73509189 0.73509189]
 [0.77378094 0.77378094 0.77378094 0.77378094]
 [0.81450625 0.81450625 0.81450625 0.81450625]
 [0.857375   0.857375   0.857375   0.857375  ]
 [0.9025     0.9025     0.9025     0.9025    ]
 [0.95       0.95       0.95       0.95      ]
 [1.         1.         1.         1.        ]
 [0.         0.         0.         0.        ]]


In [11]:
# Greedy policy: for every state pick the action with the largest Q-value
# (argmax returns the lowest action index when several values tie).
optimal_policy = Q_table.argmax(axis=1)
print("\nOptimal Policy:", optimal_policy, sep="\n")


Optimal Policy:
[0 0 1 3 0 0 0 0 0 0 0 0 0 0 0 0]
