## Markov Decision Processes (MDP)

### Solving MDP : Electric Cart
![electric-cart-MDP](figs/electric_cart_0.png)


In [65]:
#The input parameters
a_val = 5
b_val = 5
alpha = 0.2
beta = 0.8
gamma = 0.9


In [66]:
#state space
S = {'LOW_CHARGE':0, 'HIGH_CHARGE':1}
#action space
A = {'IDLE':0, 'FERRY':1, 'RECHARGE':2}


In [67]:
def step(s,a):
	#returns probability of transition to state s1 with a reward r given agent is at state s taking action a
	#as a tuple (p(s',r|s,a), r, s')
	if s==S['LOW_CHARGE'] and a == A['IDLE']:
		return [(0.0,0,S['HIGH_CHARGE']), (1.0,a_val,S['LOW_CHARGE'])]
	elif s==S['LOW_CHARGE'] and a==A['FERRY']:
		return [(1-alpha, -10, S['HIGH_CHARGE']), (alpha, b_val, S['LOW_CHARGE'])]
	elif s==S['LOW_CHARGE'] and a==A['RECHARGE']:
		return [ (1.0,0.0,S['HIGH_CHARGE']), (0,0,S['LOW_CHARGE'])]
	elif s==S['HIGH_CHARGE'] and a==A['IDLE']:
		return [(1.0, a_val, S['HIGH_CHARGE']), (0,0,S['LOW_CHARGE'])]
	elif s==S['HIGH_CHARGE'] and a==A['FERRY']:
		return [(beta, b_val, S['HIGH_CHARGE']),(1-beta,b,S['LOW_CHARGE'])] 
	elif s==S['HIGH_CHARGE'] and a==A['RECHARGE']:
		return [(0.0, 0.0, S['HIGH_CHARGE']),(0,0,S['LOW_CHARGE'])]
		

### Optimal Value function
$v^*(s) = \underset{a}{\text{max}} \quad \sum_{s',r}p(s',r|s,a)[ r + \gamma v^*(s')]$

In [71]:
import numpy as np


In [72]:
#initialization
v = np.zeros((2,1)) #v[LOW_CHARGE] and v[HIGH_CHARGE]
v_old = np.zeros((2,1))

In [73]:
#off-policy iterations
eps=1e-6
#repeat until convergence
while True:
	for state_name,s in S.items():
		for action_name,a in A.items():
			v_current = 0
			for x in step(s,a): #loop through all possible steps given action a from state s
				p,r,s1 = x
				v_current += p*(r + gamma*v[s1])
			if v_current > v[s]:
				print(f'v[{state_name}]:{v[s]} updated to {v_current}')
				v[s] = v_current
			
	if np.sqrt(np.sum((v-v_old)**2)) < eps: #convergence criteria
		break 
	#move on to next iteration
	v_old = v
		
		
	

v[LOW_CHARGE]:[0.] updated to [5.]
v[HIGH_CHARGE]:[0.] updated to [5.]
v[HIGH_CHARGE]:[5.] updated to [9.5]
v[LOW_CHARGE]:[5.] updated to [9.5]
v[HIGH_CHARGE]:[9.5] updated to [13.55]
v[HIGH_CHARGE]:[13.55] updated to [16.466]


In [74]:
#optimal v
for state_name,s in S.items():
	print(f'v*({state_name}) = {v[s]}')


v*(LOW_CHARGE) = [9.5]
v*(HIGH_CHARGE) = [16.466]
