# Reinforcement Learning

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Reward matrix: row = current room, column = room being moved into.
# available_options() treats every entry >= 0 as a permitted move, so -1
# marks a transition that is never offered. A reward of 100 appears only in
# column 5.
R = np.matrix([
    [-1, -1, -1, -1,  0,  -1],   # from room 0
    [-1, -1, -1,  0, -1, 100],   # from room 1
    [-1, -1, -1,  0, -1,  -1],   # from room 2
    [-1,  0,  0, -1,  0,  -1],   # from room 3
    [-1,  0,  0, -1, -1, 100],   # from room 4
    [-1,  0, -1, -1,  0, 100],   # from room 5
])

In [3]:
# Q table: a 6x6 matrix with one cell per (state, action) pair.
# Every entry starts at zero, i.e. nothing has been learned yet.
Q = np.matrix(np.zeros(shape=(6, 6)))
Q

matrix([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [5]:
# gamma: weight applied to the next state's best Q value inside update()
gamma = 0.8
# state used by the single-step demonstrations in the cells below
initial_state = 1


In [11]:
def available_options(state):
    """Return the column indices of the moves permitted from ``state``.

    A move is permitted when its entry in row ``state`` of the global reward
    matrix ``R`` is greater than or equal to zero.
    """
    permitted_mask = R[state,] >= 0
    # The row slice of a matrix is still 2-D, so np.where yields a
    # (row indices, column indices) pair; only the columns are of interest.
    _, permitted_columns = np.where(permitted_mask)
    return permitted_columns

# Moves permitted from the starting state
available_rooms = available_options(state=initial_state)
print(available_rooms)



[3 5]


In [12]:
def sample_next_action(available_actions_range):
    """Pick one action uniformly at random from ``available_actions_range``.

    Parameters
    ----------
    available_actions_range : 1-D array-like of int
        Candidate actions to choose from.

    Returns
    -------
    int
        The chosen action as a plain Python ``int``.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when the candidate sequence is
        empty.
    """
    # Drawing without a ``size`` argument yields a NumPy scalar instead of a
    # one-element array. Calling int() on a one-element array is deprecated
    # since NumPy 1.25 and is slated to become an error.
    return int(np.random.choice(available_actions_range))

# Randomly choose the next action, i.e. the room to move into next
action = sample_next_action(available_actions_range=available_rooms)
print(action)

5


In [13]:
# Apply one Q-learning update to the global Q matrix for the chosen move.

def update(current_state, action, gamma):
    """Update ``Q[current_state, action]`` in place.

    The new value is::

        R[current_state, action] + gamma * max(Q[action, :])

    The action index doubles as the index of the state that is entered, so
    row ``action`` of ``Q`` supplies the best value obtainable afterwards.

    Parameters
    ----------
    current_state : int
        Row index of the state the move starts from.
    action : int
        Column index of the move taken.
    gamma : float
        Weight applied to the best Q value of the state that is entered.

    Returns
    -------
    None
        The global matrix ``Q`` is modified in place; the global matrix ``R``
        is only read.
    """
    # The original located the arg-max index (breaking ties at random) only
    # to read the value back. All tied indices hold the same value, so the
    # row maximum is taken directly. This also removes the int() calls on
    # one-element arrays, which are deprecated since NumPy 1.25.
    # Unlike the original, this no longer draws from the global RNG on ties.
    best_next_value = np.max(Q[action,])
    Q[current_state, action] = R[current_state, action] + gamma * best_next_value

# One demonstration update, starting from the initial state
update(current_state=initial_state, action=action, gamma=gamma)
print(Q)

[[  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0. 100.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]]


In [14]:
# Training: 10,000 updates, each starting from a uniformly random state and
# taking a uniformly random permitted action.
for episode in range(10000):
    current_state = np.random.randint(low=0, high=Q.shape[0])
    available_actions = available_options(current_state)
    action = sample_next_action(available_actions)
    update(current_state, action, gamma)
print(Q)

[[  0.   0.   0.   0. 400.   0.]
 [  0.   0.   0. 320.   0. 500.]
 [  0.   0.   0. 320.   0.   0.]
 [  0. 400. 256.   0. 400.   0.]
 [  0. 400. 256.   0.   0. 500.]
 [  0. 400.   0.   0. 400. 500.]]


In [15]:
# Rescale the trained Q values so that the largest entry becomes 100
Q / Q.max() * 100

matrix([[  0. ,   0. ,   0. ,   0. ,  80. ,   0. ],
        [  0. ,   0. ,   0. ,  64. ,   0. , 100. ],
        [  0. ,   0. ,   0. ,  64. ,   0. ,   0. ],
        [  0. ,  80. ,  51.2,   0. ,  80. ,   0. ],
        [  0. ,  80. ,  51.2,   0. ,   0. , 100. ],
        [  0. ,  80. ,   0. ,   0. ,  80. , 100. ]])

In [18]:
# Follow the learned Q values greedily: start in room 2 and keep moving to
# the action with the highest Q value until room 5 is reached.
# NOTE(review): the loop has no step limit and relies on Q having been
# trained well enough that the greedy walk actually reaches room 5.
current_state = 2
steps = [current_state]  # rooms visited so far, in order

while current_state != 5:
    # Column indices whose Q value equals the row maximum (may be several).
    next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]
    if next_step_index.shape[0] > 1:
        # Break ties uniformly at random. Drawing without ``size`` returns a
        # scalar, avoiding int() on a one-element array (deprecated since
        # NumPy 1.25).
        next_step_index = int(np.random.choice(next_step_index))
    else:
        # Single candidate: index it explicitly rather than converting the
        # one-element array with int(), for the same deprecation reason.
        next_step_index = int(next_step_index[0])
    steps.append(next_step_index)
    current_state = next_step_index
print("selected path")
print(steps)
        

selected path
[2, 3, 1, 5]
