# Temporal Difference Control using SARSA

In this demo, we estimate the optimal policy, starting from a random policy, using the SARSA algorithm.

In [1]:
import numpy as np
from gridworld import GridWorld
np.set_printoptions(precision=3,suppress=True)

## GridWorld


1. The robot moves one step in the world based on the action given.<br>
2. The action can be 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT. <br>
3. The environment is stochastic: when we say "up", the robot goes up only with a specified probability. <br>


# Value function Update (State-action-reward-state-action)

Update the state-action function by looking up the value corresponding to the new observation. <br>
When we move from state $s_t$ by taking an action $a_t$, we get a reward $r_{t+1}$ and move to a new state $s_{t+1}$ that gives us a new observation. We use this to look up the state_action matrix (Q-values) and compute the error. <br>
The state_action matrix entry corresponding to the previous state-action pair is updated according to this error:

$$Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left[ r_{t+1} + \gamma\, Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t) \right]$$


In [2]:
def update_state_action(state_action_matrix, observation, new_observation, 
                   action, new_action, reward, alpha, gamma, n_cols=4):
    '''Apply one SARSA update to the state-action (Q) matrix and return it.

    The entry for the state-action pair that was just executed is moved
    toward the one-step TD target:

        Q[a, s] += alpha * (reward + gamma * Q[a', s'] - Q[a, s])

    Parameters
    ----------
    state_action_matrix : 2-D array indexed as [action, state_index], where
        a grid cell (row, col) maps to state_index = row * n_cols + col.
    observation : indexable (row, col) of the cell the action was taken from.
    new_observation : indexable (row, col) of the cell that was reached.
    action : index of the action taken in `observation`.
    new_action : index of the action chosen for `new_observation`.
    reward : reward received for the transition.
    alpha : step size of the update.
    gamma : discount factor applied to the next Q-value.
    n_cols : number of columns of the grid, used to flatten (row, col)
        into a state index. Defaults to 4, the width used in this notebook.

    Returns
    -------
    The same `state_action_matrix` object; it is modified in place.

    Notes
    -----
    A negative `new_action` is not special-cased: NumPy negative indexing
    makes it read the last action row. This notebook stores -1 in the
    policy for terminal cells, so that case does reach this function.
    '''
    # Flatten both grid positions into column indices of the Q matrix
    col = observation[1] + (observation[0] * n_cols)
    col_t1 = new_observation[1] + (new_observation[0] * n_cols)
    # Q-values at t and at t+1
    q = state_action_matrix[action, col]
    q_t1 = state_action_matrix[new_action, col_t1]
    # Move Q(s_t, a_t) toward the TD target by a fraction alpha
    state_action_matrix[action, col] += alpha * (reward + gamma * q_t1 - q)
    return state_action_matrix

Now that we have updated the state_action matrix, we look up the matrix, find the best action (greedy update), and update our policy to reflect it.

In [3]:
def update_policy(policy_matrix, state_action_matrix, observation):
    '''Make the policy greedy with respect to Q at a single grid cell.

    Parameters
    ----------
    policy_matrix : 2-D array shaped like the grid (rows, cols); the entry
        at `observation` is overwritten with the greedy action index.
    state_action_matrix : 2-D array indexed as [action, state_index], where
        cell (row, col) maps to state_index = row * grid_width + col.
    observation : indexable (row, col) of the cell to update.

    Returns
    -------
    The same `policy_matrix` object; it is modified in place.
    '''
    # Take the grid width from the policy itself instead of assuming 4 columns
    n_cols = policy_matrix.shape[1]
    col = observation[1] + (observation[0] * n_cols)
    # Action with the highest Q-value in this state (first one wins on ties)
    best_action = np.argmax(state_action_matrix[:, col])
    policy_matrix[observation[0], observation[1]] = best_action
    return policy_matrix

In [4]:
def return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1, tot_actions=4):
    '''Sample an action epsilon-greedily around the policy's action.

    The action stored in the policy for `observation` is selected with
    probability 1 - epsilon + epsilon / n, and every other action with
    probability epsilon / n, where n is the size of the action space.

    Parameters
    ----------
    policy_matrix : 2-D array of action indices, indexed by (row, col).
    observation : indexable (row, col) of the current cell.
    epsilon : exploration rate in [0, 1].
    tot_actions : size of the action space. Defaults to 4 (0=UP, 1=RIGHT,
        2=DOWN, 3=LEFT). If the policy holds a larger action index, the
        action space is widened to include it.

    Returns
    -------
    A NumPy array of shape (1,) holding the sampled action index.
    '''
    # Do not size the action space from the policy's current maximum alone:
    # a policy that does not yet contain the highest action would never
    # explore it. The policy-derived count is kept only as a lower bound.
    actions_in_policy = int(np.nanmax(policy_matrix) + 1)
    n_actions = max(int(tot_actions), actions_in_policy)
    action = int(policy_matrix[observation[0], observation[1]])
    non_greedy_prob = epsilon / n_actions
    greedy_prob = 1 - epsilon + non_greedy_prob
    weight_array = np.full((n_actions), non_greedy_prob)
    weight_array[action] = greedy_prob
    return np.random.choice(n_actions, 1, p=weight_array)

In [5]:

# Create the environment (GridWorld is imported from the local gridworld module).
# The (3, 4) arguments presumably mean (rows, cols), matching the 3x4 matrices
# defined in the following cells -- confirm against the gridworld module.
env = GridWorld(3, 4)

# State matrix of the 3x4 grid, written out cell by cell.
# The 1 entries sit at (0,3) and (1,3) and the -1 entry at (1,1): the same cells
# the policy cell marks as terminal states and as the obstacle. How GridWorld
# itself interprets these codes is defined in the gridworld module.
state_matrix = np.array([[0.0,  0.0, 0.0, 1.0],
                         [0.0, -1.0, 0.0, 1.0],
                         [0.0,  0.0, 0.0, 0.0]])
print("State Matrix:")
print(state_matrix)

State Matrix:
[[ 0.  0.  0.  1.]
 [ 0. -1.  0.  1.]
 [ 0.  0.  0.  0.]]


In [6]:
# Reward matrix: every cell yields -0.04 except the two cells in the last
# column of the top two rows, which yield +1 at (0,3) and -1 at (1,3).
reward_matrix = np.full((3, 4), -0.04)
reward_matrix[:2, 3] = (1, -1)
print("Reward Matrix:")
print(reward_matrix)

Reward Matrix:
[[-0.04 -0.04 -0.04  1.  ]
 [-0.04 -0.04 -0.04 -1.  ]
 [-0.04 -0.04 -0.04 -0.04]]


In [7]:
# Transition matrix: a 4x4 matrix with one row per action index (0..3).
# Each row is the base row [0.8, 0.1, 0.0, 0.1] rotated right by the row index,
# so 0.8 sits on the diagonal, 0.1 on both neighbouring indices (wrapping
# around) and 0.0 on the opposite index.
# Presumably row = requested action and column = action actually executed --
# confirm against the gridworld module.
transition_matrix = np.array([np.roll([0.8, 0.1, 0.0, 0.1], shift)
                              for shift in range(4)])

In [8]:
# Random policy to start with: one action index in {0, 1, 2, 3} per grid cell,
# stored as float32 so the obstacle cell can hold NaN.
policy_matrix = np.random.randint(low=0, high=4, size=(3, 4)).astype(np.float32)
# NaN for the obstacle at (1,1). np.nan is used because the np.NaN alias was
# removed in NumPy 2.0.
policy_matrix[1, 1] = np.nan
# No action (-1) for the terminal states at (0,3) and (1,3)
policy_matrix[0, 3] = policy_matrix[1, 3] = -1
print("Policy Matrix:")
print(policy_matrix)

Policy Matrix:
[[ 2.  1.  1. -1.]
 [ 0. nan  1. -1.]
 [ 2.  1.  1.  0.]]


In [9]:
# Hand the grid description to the environment: the state matrix, the per-cell
# rewards and the 4x4 transition matrix defined in the cells above.
# NOTE(review): whether these setters validate shapes or depend on call order
# is determined by the gridworld module, which is not shown here.
env.setStateMatrix(state_matrix)
env.setRewardMatrix(reward_matrix)
env.setTransitionMatrix(transition_matrix)

In [10]:
# Q-table indexed as [action, state_index]: 4 actions x 12 cells (3x4 grid),
# initialised to zero.
state_action_matrix = np.zeros(shape=(4, 12))
# Zero matrix of the same shape, kept as a separate array
visit_counter_matrix = np.zeros_like(state_action_matrix)
# Discount factor and step size passed to the update rule
gamma, alpha = 0.999, 0.001
# Number of training episodes, and how often (in episodes) progress is printed
tot_epoch, print_epoch = 5000, 1000

In [11]:
# SARSA control loop: run tot_epoch episodes, updating the Q-table and the
# policy after every environment step. Actions are read directly from
# policy_matrix; no epsilon-greedy sampling is performed in this loop.
for epoch in range(tot_epoch):
    # Reset the environment and get the first observation of the episode
    observation = env.reset(exploring_starts=True)
    for step in range(1000):  # cap on the number of steps per episode
        # A_t: the action the current policy stores for the current cell
        action = int(policy_matrix[observation[0], observation[1]])
        # Step the environment, then read A_{t+1} from the policy at the new cell
        new_observation, reward, done = env.step(action)
        new_action = int(policy_matrix[new_observation[0], new_observation[1]])
        # One SARSA update on Q(S_t, A_t)
        state_action_matrix = update_state_action(
            state_action_matrix, observation, new_observation,
            action, new_action, reward, alpha, gamma)
        # Make the policy greedy at the cell that was just left
        policy_matrix = update_policy(policy_matrix, state_action_matrix, observation)
        observation = new_observation
        if done:
            break

    # Periodic progress report, every print_epoch episodes
    if epoch % print_epoch == 0:
        print("")
        print("State-Action matrix after " + str(epoch+1) + " iterations:")
        print(state_action_matrix)
        print("Policy matrix after " + str(epoch+1) + " iterations:")
        print(policy_matrix)


# Final report: the learned state-action matrix and the resulting policy
print("State-Action matrix after " + str(tot_epoch) + " iterations:")
print(state_action_matrix)
print("Policy matrix after " + str(tot_epoch) + " iterations:")
print(policy_matrix)


State-Action matrix after 1 iterations:
[[ 0.     0.     0.     0.    -0.     0.     0.     0.    -0.     0.
   0.     0.   ]
 [ 0.     0.     0.     0.    -0.     0.    -0.001  0.    -0.    -0.
  -0.     0.   ]
 [-0.     0.     0.     0.    -0.     0.     0.     0.    -0.     0.
   0.     0.   ]
 [ 0.     0.     0.     0.    -0.     0.     0.     0.     0.     0.
   0.     0.   ]]
Policy matrix after 1 iterations:
[[ 0.  1.  1. -1.]
 [ 3. nan  0. -1.]
 [ 3.  0.  0.  0.]]

State-Action matrix after 1001 iterations:
[[-0.007 -0.001  0.     0.    -0.015  0.     0.052  0.    -0.018 -0.017
  -0.012 -0.039]
 [-0.003  0.1    0.536  0.    -0.015  0.    -0.009  0.    -0.018 -0.017
  -0.012 -0.039]
 [-0.007 -0.001  0.     0.    -0.015  0.    -0.009  0.    -0.018 -0.017
  -0.012 -0.039]
 [-0.007 -0.001  0.     0.    -0.015  0.    -0.008  0.    -0.018 -0.017
  -0.012 -0.039]]
Policy matrix after 1001 iterations:
[[ 1.  1.  1. -1.]
 [ 2. nan  0. -1.]
 [ 2.  3.  0.  3.]]

State-Action matrix after