In [1]:
import numpy as np

In [2]:
# R = Reward Matrix, indexed as R[state, action]; taking `action` moves the agent to state `action`
R = np.array([[-1, -1, -1, -1, 0, -1],
              [-1, -1, -1, 0, -1, 100],
              [-1, -1, -1, 0, -1, -1],
              [-1, 0, 0, -1, 0, -1],
              [-1, 0, 0, -1, -1, 100],
              [-1, 0, -1, -1, 0, 100]])
R
# -1  : no direct connection between the two states (the move is not allowed)
# 0   : direct connection (the move is allowed) that yields no reward
# 100 : direct connection into the goal state 5 (the move is allowed and is rewarded with 100)

array([[ -1,  -1,  -1,  -1,   0,  -1],
       [ -1,  -1,  -1,   0,  -1, 100],
       [ -1,  -1,  -1,   0,  -1,  -1],
       [ -1,   0,   0,  -1,   0,  -1],
       [ -1,   0,   0,  -1,  -1, 100],
       [ -1,   0,  -1,  -1,   0, 100]])

In [3]:
# Q = action-value table, Q[state, action]; every entry starts at zero
# and is filled in during training.
Q = np.zeros((6, 6))
Q

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

### Defining Hyperparameters

In [4]:
gamma = 0.8 # discount factor
# gamma ranges from 0 to 1 and controls how much future reward counts in the Q update:
# close to 0 the agent values almost only the immediate reward,
# close to 1 it weighs long-term reward nearly as much as the immediate one.
# (gamma does not control exploration vs. exploitation.)

### Getting all available actions for particular state

In [5]:
def available_actions(state):
    """Return the actions that can be taken from ``state``.

    An action is available when its entry in row ``state`` of the reward
    matrix ``R`` is non-negative.

    Parameters
    ----------
    state : int
        Row index into ``R``.

    Returns
    -------
    numpy.ndarray
        1-D array of the column indices whose reward is ``>= 0``.
    """
    # For a 1-D boolean mask, flatnonzero gives the indices of the True entries.
    return np.flatnonzero(R[state] >= 0)

In [6]:
initial_state = 1 # Starting state for this walkthrough (hard-coded here; training samples states at random)

available_act = available_actions(initial_state)
print(available_act)
# The available actions are the states the agent can move to directly from initial_state.

[3 5]


### Choosing which action to be performed by agent

In [7]:
def sample_next_action( available_action_range ):
    """Pick one action at random from ``available_action_range``.

    Parameters
    ----------
    available_action_range : array-like of int
        Candidate actions, e.g. the result of ``available_actions(state)``.
        Must be non-empty; ``np.random.choice`` raises ``ValueError`` otherwise.

    Returns
    -------
    int
        The chosen action as a plain Python ``int``.
    """
    # Sample from the argument itself rather than from a notebook-level
    # variable, so the result depends only on what the caller passes in.
    next_action = int(np.random.choice(available_action_range))
    return next_action

In [8]:
# Pick one of the currently available actions at random and show it.
action = sample_next_action( available_act )
print(action)

3


### Updating Q matrix and Q Learning Algorithm

In [9]:
def update_Q( current_state, action, gamma ):
    """Apply one Q-learning update to the global table ``Q`` in place.

    Taking ``action`` moves the agent to state ``action``, so the best
    follow-up value is the maximum of row ``action`` of ``Q``.

    Parameters
    ----------
    current_state : int
        State the agent is in (row of ``Q`` and ``R``).
    action : int
        Action taken, which is also the index of the next state.
    gamma : float
        Discount factor applied to the best follow-up value.

    Returns
    -------
    None
        ``Q[current_state, action]`` is overwritten.
    """
    # Only the maximum value is needed, not which action attains it,
    # so no tie-breaking between equally good actions is required.
    max_value = np.max(Q[action, :])

    # Q Learning Formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

In [10]:
# Apply a single Q-learning update for the sampled (initial_state, action) pair; Q is modified in place.
update_Q(initial_state, action, gamma)

## Training

In [11]:
# Number of random (state, action) samples used for training.
# More samples give the Q-table more chances to converge.
n_training_steps = 10000

for training_step in range(n_training_steps):
    # Each step starts from a uniformly random state, takes one random
    # available action from it, and updates the matching Q entry.
    current_state = np.random.randint(0, Q.shape[0])
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update_Q(current_state, action, gamma)

### Normalize the Trained Q matrix

In [12]:
# Q-table after training, before any scaling is applied.
Q

array([[  0.,   0.,   0.,   0., 400.,   0.],
       [  0.,   0.,   0., 320.,   0., 500.],
       [  0.,   0.,   0., 320.,   0.,   0.],
       [  0., 400., 256.,   0., 400.,   0.],
       [  0., 400., 256.,   0.,   0., 500.],
       [  0., 400.,   0.,   0., 400., 500.]])

In [13]:
print("Trained Q matrix:")
print(Q / (np.max(Q) * 100))
# Divides every entry by (100 * the maximum Q-value), so the largest printed entry is 0.01,
# not 1. This is for display only: Q itself is not modified.
# NOTE(review): Q / np.max(Q) would give a 0-1 scale and Q / np.max(Q) * 100 would give
# percentages of the maximum - confirm which scaling was intended.

Trained Q matrix:
[[0.      0.      0.      0.      0.008   0.     ]
 [0.      0.      0.      0.0064  0.      0.01   ]
 [0.      0.      0.      0.0064  0.      0.     ]
 [0.      0.008   0.00512 0.      0.008   0.     ]
 [0.      0.008   0.00512 0.      0.      0.01   ]
 [0.      0.008   0.      0.      0.008   0.01   ]]


## Testing

In [14]:
# Follow the greedy policy (always take the highest-valued action) from
# start_state until goal_state is reached, recording every visited state.
start_state = 2
goal_state = 5
# Safety cap: without it the loop never ends if the greedy choices cycle
# (e.g. when Q has not been trained). Q.size is a generous bound.
max_steps = Q.size

current_state = start_state
steps = [current_state]

while current_state != goal_state:
    if len(steps) > max_steps:
        raise RuntimeError(
            "Greedy path did not reach the goal state within "
            + str(max_steps) + " steps; Q may not be trained."
        )

    q_row = Q[current_state, :]
    next_step_index = np.where(q_row == np.max(q_row))[0]

    # Break ties between equally valued actions at random.
    if next_step_index.shape[0] > 1:
        next_step_index = int(np.random.choice(next_step_index))
    else:
        next_step_index = int(next_step_index[0])

    steps.append(next_step_index)

    # The chosen action is also the state the agent moves to.
    current_state = next_step_index

### Results

In [15]:
# Show the sequence of states visited by the greedy walk, start state first.
print("Best sequence path:", steps)

Best sequence path: [2, 3, 4, 5]
