In [1]:
# expermiment of using dymic programming to solve arbitrary problem with policy iteration and value iteration
import numpy as  np
NUM_STATES = 5
NUM_ACTIONS = 3

In [3]:
# create sample state transition p(s_, r | s, a)
p = np.random.uniform(0, 1, size=(NUM_STATES, NUM_STATES))
p

array([[0.02615745, 0.78336181, 0.40839532, 0.38801567, 0.58992588],
       [0.72380701, 0.98730037, 0.92682389, 0.63304074, 0.46174731],
       [0.31882491, 0.93421116, 0.86604922, 0.41009437, 0.16493008],
       [0.04466429, 0.12080311, 0.50105751, 0.17546288, 0.72556023],
       [0.69125179, 0.4958132 , 0.26665   , 0.41962378, 0.96816784]])

In [72]:
# in this case p is a single state transition, but in decision making systems there will be multiple state transition probabilities for differnet 
# actions
full_state_tranition = np.random.uniform(0, 1, size=(NUM_ACTIONS, NUM_STATES, NUM_STATES))
full_state_tranition[0][2] = np.zeros(5); full_state_tranition[0][2][2] = 1
full_state_tranition[1][2] = np.zeros(5); full_state_tranition[1][2][2] = 1
full_state_tranition[2][2] = np.zeros(5); full_state_tranition[2][2][2] = 1
full_state_tranition

array([[[0.14339917, 0.41584085, 0.34225596, 0.10932779, 0.21566567],
        [0.45018973, 0.38749522, 0.68570135, 0.27622476, 0.28250701],
        [0.        , 0.        , 1.        , 0.        , 0.        ],
        [0.26025113, 0.03196472, 0.65869642, 0.63776217, 0.20665028],
        [0.87182697, 0.09877816, 0.72491228, 0.11928014, 0.94359884]],

       [[0.58050515, 0.24429769, 0.93237344, 0.06323805, 0.2080298 ],
        [0.48762927, 0.04349498, 0.7261596 , 0.47952975, 0.94153642],
        [0.        , 0.        , 1.        , 0.        , 0.        ],
        [0.41564663, 0.28685354, 0.80810742, 0.78945847, 0.3005638 ],
        [0.89786006, 0.41018395, 0.26677131, 0.9672789 , 0.08991338]],

       [[0.39462617, 0.90554868, 0.30201547, 0.28636145, 0.69818032],
        [0.72546344, 0.01739959, 0.74949358, 0.56533042, 0.78225309],
        [0.        , 0.        , 1.        , 0.        , 0.        ],
        [0.42835978, 0.47301234, 0.63340585, 0.17105352, 0.1391543 ],
        [0.17193

In [103]:
# In this case there is a state transition for every actions
# now lets create the policy
policy = np.random.randint(0, 3, size=(NUM_STATES))
policy # this describes the action we take in each state - this is a deterministic policy

array([1, 2, 0, 2, 0])

In [8]:
stochatic_policy = np.random.uniform(0, 1, size=(NUM_STATES, NUM_ACTIONS))
stochatic_policy

array([[0.0028092 , 0.26338111, 0.02366215],
       [0.03291537, 0.16001783, 0.12883582],
       [0.13194515, 0.9690063 , 0.19958283],
       [0.48585821, 0.45206064, 0.27754835],
       [0.01959283, 0.90004438, 0.46959289]])

In [68]:
# now we will desing the rewards for all the states
rewards = [-1] * 5
rewards[2] = 0
rewards # all positions are zero, except the middle positions which has a reward of zero

[-1, -1, 0, -1, -1]

In [11]:
terminal_state = 2 # we are assingning our terminal state to be 2, we will be using this when experimenting on monte carlo methods

In [13]:
np.random.uniform(0, 1, size=5)

array([0.6327861 , 0.00264247, 0.01055561, 0.22343428, 0.69921155])

In [26]:
discount_factor = 0.9

In [18]:
p = full_state_tranition[0][0]
np.argmax(p)

4

In [51]:
threshold = 1e-2
threshold

0.01

In [83]:
# policy evaluation
def evaluate_policy(policy):
    # randomized value for each state
    state_value = np.random.uniform(0, 1, size=5)
    state_value[2] = 0 # setting state value for terminal state to 0
    max_gap = 0
    while True:
        for state in range(NUM_STATES):
            action = policy[state] # returns the action to be taken
            next_state = np.argmax(full_state_tranition[action][state]) # returns the next state based on the environment dynamics
            reward = rewards[next_state] # gets teh reward of the next state
            old_state = state_value[state]
            state_value[state] = reward + discount_factor * state_value[next_state]
            print("The old state value is :", old_state, "the new state value is :", state_value[state])
            current_gap = np.abs(old_state - state_value[state])
            if current_gap > max_gap:
                max_gap = current_gap
        
        print(max_gap)
        if max_gap <= threshold:
            break 
        else:
            max_gap = 0
    return state_value

In [90]:
policy

array([0, 0, 1, 0, 2])

In [91]:
new_value_funciton = evaluate_policy(policy)
new_value_funciton

array([-1.,  0.,  0.,  0., -1.])

In [79]:
def improve_policy(value_function):
    # act greedily based on the value function to improve the existing policy
    policy = np.arange(NUM_STATES)
    for state in range(NUM_STATES):
        action_states = [] # for each action which state will i be going to
        for action in range(NUM_ACTIONS):
            next_state = np.argmax(full_state_tranition[action][state])
            value_of_next_state = value_function[next_state]
            action_states.append(value_of_next_state) # add value of next stat to actioin states list
        policy[state] = np.argmax(action_states) # select action whose next state value is highest
    return policy

In [92]:
new_policy = improve_policy(new_value_funciton)
new_policy

array([0, 0, 0, 0, 1])

In [101]:
def policy_iteration(policy):
    while True:
        value_function = evaluate_policy(policy)
        old_policy = policy
        policy = improve_policy(value_function)
        if sum(np.array(old_policy) != np.array(policy)) == 0:
            return policy, value_function
        

In [105]:
# this always evaluates to the optimal policy given the environment state dynamics
policy_iteration(policy)

(array([0, 0, 0, 0, 1]), array([-1.,  0.,  0.,  0., -1.]))

In [None]:
def value_iteration():
    pass

In [None]:
def monte_carlo_policy_prediction():
    pass