In [12]:
import numpy as np
from collections import deque

In [13]:
# Environment: a square grid world whose cells are numbered row by row,
# so state 15 is the bottom-right cell of the 4x4 grid.
grid_size = 4
n_states = grid_size ** 2
goal_state = 15

# Action name -> integer code; the code is also the column index of that
# action in the Q-table.
actions = {name: code for code, name in enumerate(('up', 'down', 'left', 'right'))}
n_actions = len(actions)

# Hyperparameters
learning_rate0 = 0.8           # learning rate used in the first epoch
decay = 0.01                   # learning rate for an epoch is learning_rate0 / (1 + epoch * decay)
discount_factor = 0.95         # weight given to the best next-state Q-value
exploration_probability = 0.2  # chance of taking a random action at each step
epochs = 1000                  # number of training episodes

# Replay buffer settings (currently disabled)
# buffer_size = 100  # Maximum size of the replay buffer
# batch_size = 32  # Number of samples to train on per epoch

In [14]:
def get_next_state(state, action):
    """Return the state reached by taking `action` from `state`.

    States are row-major cell indices of a `grid_size` x `grid_size` grid.
    A move that would cross the grid edge, or an action code that is not
    one of the values in `actions`, leaves the state unchanged.
    """
    row, col = divmod(state, grid_size)
    last = grid_size - 1

    # (action code, move stays on the grid?, change in state index)
    candidates = (
        (actions['up'], row > 0, -grid_size),
        (actions['down'], row < last, grid_size),
        (actions['left'], col > 0, -1),
        (actions['right'], col < last, 1),
    )
    for code, allowed, offset in candidates:
        if action == code and allowed:
            return state + offset
    return state

In [15]:
# One row per state, one column per action; every estimate starts at zero.
Q_table = np.zeros(shape=(n_states, n_actions), dtype=np.float64)
# replay_buffer = deque(maxlen=buffer_size)

In [16]:
# Q-learning algorithm
# Each epoch is one episode: start in a uniformly random state and step until
# the goal state is entered. If the random start is already the goal state,
# the episode performs no updates.
for epoch in range(epochs):
    current_state = np.random.randint(0, n_states)  # Start from a random state

    # The learning rate depends only on the epoch, so compute it once per
    # episode instead of on every step.
    learning_rate = learning_rate0 / (1 + epoch * decay)

    while current_state != goal_state:
        # Choose action with epsilon-greedy strategy
        if np.random.rand() < exploration_probability:
            action = np.random.randint(0, n_actions)  # Explore
        else:
            # Exploit; np.argmax returns the first index on ties
            action = np.argmax(Q_table[current_state])

        # Simulate the environment (move to the next state)
        next_state = get_next_state(current_state, action)

        # Reward: +1 for entering the goal state, otherwise a -0.1 step penalty
        reward = 1 if next_state == goal_state else -0.1

        # Experience replay (disabled). NOTE: enabling this also requires
        # `import random` and the commented-out buffer_size / batch_size /
        # replay_buffer definitions.
        # replay_buffer.append((current_state, action, reward, next_state))
        # if len(replay_buffer) >= batch_size:
        #     batch = random.sample(replay_buffer, batch_size)
        #     for s, a, r, ns in batch:
        #         Q_table[s, a] += learning_rate * \
        #                          (r + discount_factor * np.max(Q_table[ns]) - Q_table[s, a])

        # Q-learning update: move Q(s, a) towards
        # reward + discount_factor * max_a' Q(s', a')
        Q_table[current_state, action] += learning_rate * (
            reward
            + discount_factor * np.max(Q_table[next_state])
            - Q_table[current_state, action]
        )

        current_state = next_state  # Move to the next state

In [17]:
# Show the learned action values (rows: states, columns: actions).
print("\nQ-Table after Q-Learning:", Q_table, sep="\n")


Q-Table after Q-Learning:
[[-0.21824202  0.32133098 -0.19265715  0.14791892]
 [-0.14707673  0.1327206  -0.01980184  0.44351592]
 [ 0.27204183  0.572125    0.14646395  0.49969753]
 [ 0.26505736  0.7075      0.3806348   0.1289438 ]
 [ 0.06134118  0.44351875  0.20132995  0.01050313]
 [-0.04144797  0.4389111   0.16609809  0.572125  ]
 [ 0.37261831  0.65983416  0.4392592   0.7075    ]
 [ 0.57084391  0.85        0.57143296  0.68789369]
 [ 0.15979782  0.38240503  0.38143624  0.572125  ]
 [ 0.40625607  0.7075      0.34861786  0.65396593]
 [ 0.44971898  0.73919527  0.45023086  0.85      ]
 [ 0.7070456   1.          0.70747959  0.84991162]
 [ 0.16443156  0.27118419  0.45782219  0.70749998]
 [ 0.54086903  0.68499323  0.5690151   0.85      ]
 [ 0.70129622  0.84712669  0.69754028  1.        ]
 [ 0.          0.          0.          0.        ]]


In [20]:
# Greedy policy: for each state, the action code with the highest Q-value.
optimal_policy = Q_table.argmax(axis=1)
print("\nOptimal Policy:", optimal_policy, sep="\n")


Optimal Policy:
[1 3 1 1 1 3 3 1 3 1 3 1 3 3 3 0]
