In [7]:
import numpy as np
import pylab as plt

# Reward matrix, indexed as R[state, action]. Negative entries are filtered
# out by available_actions(), so -1 marks a move that cannot be taken; the
# entries in column 5 that equal 100 are the only positive rewards.
R = np.matrix([
    [-1, -1, -1, -1,  0,  -1],
    [-1, -1, -1,  0, -1, 100],
    [-1, -1, -1,  0, -1,  -1],
    [-1,  0,  0, -1,  0,  -1],
    [ 0, -1, -1,  0, -1, 100],
    [-1,  0, -1, -1,  0, 100],
])

# Q matrix: the learned action values, same shape as R, starting at zero.
Q = np.matrix(np.zeros(R.shape))

# Gamma: the discount factor that update() applies to the best Q-value of
# the next state.
gamma = 0.8

# Initial state. (Usually to be chosen at random)
initial_state = 0

def available_actions(state):
    """Return the indices of the actions that can be taken from `state`.

    An action counts as available when its entry in row `state` of the
    module-level reward matrix `R` is greater than or equal to zero.

    Parameters
    ----------
    state : int
        Row index into `R`.

    Returns
    -------
    numpy.ndarray
        1-D array of the column indices whose reward is >= 0 (empty when the
        row has no such entry).
    """
    is_allowed = R[state, :] >= 0
    # R is an np.matrix, so the row slice stays 2-D and np.where returns a
    # (row_indices, column_indices) pair; only the column indices matter.
    return np.where(is_allowed)[1]

# Get available actions in the current state (at this point: `initial_state`).
# The training loop further down rebinds `available_act` on every iteration.
available_act = available_actions(initial_state) 

def sample_next_action(available_actions_range):
    """Pick one action uniformly at random from `available_actions_range`.

    Parameters
    ----------
    available_actions_range : array-like of int
        1-D collection of candidate action indices, e.g. the array returned
        by `available_actions(state)`.

    Returns
    -------
    int
        The selected action index as a plain Python int.

    Raises
    ------
    ValueError
        Propagated from `np.random.choice` when the collection is empty.
    """
    # Sample from the argument itself, not from a module-level variable, so
    # the result depends only on what the caller passed in.
    # Leaving out `size` makes np.random.choice return a scalar, so int() is
    # never applied to a 1-element array (deprecated in recent NumPy).
    return int(np.random.choice(available_actions_range))

# Sample next action to be performed from the actions available in the
# initial state. The training loop below rebinds `action` before any call
# to update() uses it.
action = sample_next_action(available_act)

def update(current_state, action, gamma):
    """Apply one Q-learning update to the module-level `Q` matrix in place.

    Performs
    ``Q[current_state, action] = R[current_state, action] + gamma * max(Q[action, :])``.
    Row `action` of `Q` is used for the look-ahead, i.e. the action index is
    also treated as the index of the state that the action leads to.

    Parameters
    ----------
    current_state : int
        Row index of the state the action is taken from.
    action : int
        Column index of the action taken (and row index of the next state).
    gamma : float
        Discount factor applied to the best Q-value of the next state.

    Returns
    -------
    None
        The result is written into `Q`; nothing is returned.
    """
    # Best Q-value reachable from the next state. When several actions tie
    # for the maximum they all hold this same value, so no index lookup or
    # random tie-break is needed to obtain it (and no RNG draw is consumed).
    max_value = np.max(Q[action, :])

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Training

# Train over 1000 iterations. (Re-iterate the process above).
for i in range(1000):
    # Each step starts from a random state index in [0, number of rows of Q).
    current_state = np.random.randint(0, int(Q.shape[0]))
    # Rebinds the module-level `available_act` for the sampled state.
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    # Writes the new estimate into Q[current_state, action].
    update(current_state,action,gamma)
    
# Report the trained Q matrix twice: first as learned, then rescaled so that
# its largest entry equals 100.
print("Trained Q matrix (Before Normalization):")
print(Q)

print("Trained Q matrix (After Normalization):")
q_peak = np.max(Q)
print(Q / q_peak * 100)

Trained Q matrix (Before Normalization):
[[  0.           0.           0.           0.         399.96427035
    0.        ]
 [  0.           0.           0.         319.96427035   0.
  499.96427035]
 [  0.           0.           0.         319.97141628   0.
    0.        ]
 [  0.         399.96427035 255.97713302   0.         399.96427035
    0.        ]
 [319.96427035   0.           0.         319.97141628   0.
  499.95533794]
 [  0.         399.96427035   0.           0.         399.96427035
  499.95533794]]


In [8]:
# Show the trained Q matrix again, rescaled so its largest entry equals 100.
print("Trained Q matrix (After Normalization):")
q_scaled = Q / np.max(Q) * 100
print(q_scaled)

Trained Q matrix (After Normalization):
[[  0.           0.           0.           0.          79.99857071
    0.        ]
 [  0.           0.           0.          63.99742728   0.
  100.        ]
 [  0.           0.           0.          63.99885657   0.
    0.        ]
 [  0.          79.99857071  51.19908526   0.          79.99857071
    0.        ]
 [ 63.99742728   0.           0.          63.99885657   0.
   99.99821339]
 [  0.          79.99857071   0.           0.          79.99857071
   99.99821339]]
