# MDP (Markov Decision Process)
“Markov” generally means that given the present state, the future and the past are independent<br>
For Markov decision processes, “Markov” means action outcomes depend only on the current state<br>
This is just like search, where the successor function could only depend on the current state (not the history)<br>
An MDP is characterized by:
* Set of states S
* Start state s0
* Set of actions A
* Transitions P(s’|s,a) (or T(s,a,s’))
* Rewards R(s,a,s’) (and discount γ)

In [None]:
import random

In [None]:
#Let us have a gridworld
#ref: Chapter 17, Artificial Intelligence a Modern Approach
#ref: CS188 https://inst.eecs.berkeley.edu/~cs188/fa19/
#ref: https://inst.eecs.berkeley.edu/~cs188/fa19/assets/slides/lec8.pdf
#ref: https://courses.cs.washington.edu/courses/cse473/13au/slides/17-mdp-rl.pdf

#This class will create a 2D grid of rows x columns
#Some of the cells can be disabled by listing them in walls
#cells are addressed just like 2d arrays (r,c)
#There are possibly many terminal states
#terminal states have only one action available: Exit 
#Transition is set by default as 80% intended action and 20% sideways (the noise parameter controls this distribution)
#There is a special end state, (-1,-1), from which NO action is available. This state is used as a final state.

#Actions #just some alias
Up    = 0
Down  = 1
Right = 2
Left  = 3
Exit  = 4

class GridWorld :
    """Rectangular grid MDP with walls, terminal cells and noisy movement.

    Cells are addressed as (row, column). Terminal cells offer only the Exit
    action, which leads to the dummy end state (-1, -1); no action is
    available from the end state.
    """

    #(row, column) offset applied by each movement action
    _MOVES = {Up: (-1, 0), Down: (1, 0), Right: (0, 1), Left: (0, -1)}

    #Default is as given in the AIMA book
    def __init__(self, 
                 rows    =3, 
                 columns =4, 
                 walls   =None, terminals=None, 
                 gamma   =1.0, 
                 living_reward=0,
                 noise   =0.2
                ) :
        """We dont expect these parameters to change during the agent run

        rows, columns: grid dimensions.
        walls: list of blocked cells; None means [(1,1)].
        terminals: dict of terminal cell -> exit reward; None means
            {(0,3): +1.0, (1,3): -1.0}.
        gamma: discount factor (stored only; not used inside this class).
        living_reward: reward returned for every non-terminal state.
        noise: total probability of slipping sideways, split evenly
            between the two perpendicular directions.
        """
        #Build the defaults per instance; literal defaults in the signature
        #would be shared by every GridWorld created without these arguments.
        if walls is None :
            walls = [(1,1)]
        if terminals is None :
            terminals = {(0,3):+1.0, (1,3):-1.0}

        self.rows      = rows
        self.columns   = columns
        self.N         = rows * columns #total cells
        self.walls     = walls
        self.terminals = terminals #dictionary of terminal cells and their rewards.
        self.gamma     = gamma
        self.living_reward = living_reward
        self.all_actions   = [ Up, Down, Right, Left, Exit ]
        self.end_state     = (-1, -1) #a dummy state to reach after taking Exit
        self.all_states    = [(r,c) for r in range(rows) for c in range(columns) if (r,c) not in walls ] + [self.end_state]

        #transitions from each state and the probabilities
        self.noise                = noise
        self.action_transitions   = { 
            Up:   ([Up,    Left, Right], [1-noise, noise/2, noise/2 ]),
            Down: ([Down,  Left, Right], [1-noise, noise/2, noise/2 ]),
            Left: ([Left,  Up,   Down ], [1-noise, noise/2, noise/2 ]),
            Right:([Right, Up,   Down ], [1-noise, noise/2, noise/2 ]),
            Exit :([Exit], [1.0])
        }
    
    def actions(self, state) :
        """returns all valid actions from the current state"""
        if state in self.terminals :
            return [Exit]
        if state == self.end_state :
            return [] #No action available.
        return [ Up, Down, Right, Left ]
    
    def reward(self, state, action, next_state=None) :
        """reward is the instantaneous reward. It is usually R(s,a,s')"""
        #In grid world the reward depends only on state.
        if state in self.terminals :
            return self.terminals[state] #dict has the terminal values +1 or -1
        if state == self.end_state :
            return 0.0
        return self.living_reward        #usually a small -ve value
    
    def transitions(self, state, action) :
        """return three parallel lists: (next_states, actual_actions, probabilities)

        Entry i says that with probabilities[i] the agent really performs
        actual_actions[i] and lands in next_states[i].
        """
        actual_actions, probs = self.action_transitions[action]
        return [ self._next_cell(state, a) for a in actual_actions ], actual_actions, probs
    
    def move(self, state, action) :
        """Take the action and return the tuple(new_state, reward, is_terminal)"""
        #just a check if this is a valid action at this time or not
        assert action in self.actions(state), "action not available in this state"
        
        cells, _, probs = self.transitions(state, action)
        
        #we choose one cell according to probabilities
        new_state = random.choices(cells, weights=probs)[0] #only one; we take index 0
        reward    = self.reward(state, action, new_state)
        
        #keep the same format as OpenAI gym.
        return new_state, reward, new_state == self.end_state
    
    def _next_cell(self, state, action) : 
        """Apply the action deterministically and return the resulting cell.

        Exit leads to the end state. A move into a wall or off the grid
        leaves the agent where it is. Raises ValueError for an action that
        is neither Exit nor one of the four directions.
        """
        if action == Exit :
            return self.end_state
        if action not in self._MOVES :
            raise ValueError("unknown action: {!r}".format(action))
        
        r, c   = state #row & column
        dr, dc = self._MOVES[action]
        target = (r + dr, c + dc)
        
        if self._valid_cell(target) :
            return target
        return state #stay put, the target is invalid.
    
    def _valid_cell(self, cell) :
        """Returns true if the cell is inside the grid and not a wall"""
        r, c = cell #this may be an illegal node; we need to check
        
        #is it any of the walls?
        if (r,c) in self.walls :
            return False
        
        #is it outside the grid?
        if r < 0 or r >= self.rows or c < 0 or c >= self.columns :
            return False
        
        return True
    
    #pretty print the grid and agent if given.
    def print(self, agent_state=None) :
        """Print the grid: '#' wall, '+'/'-' terminal, '@' agent, '.' free cell"""
        for r in range(self.rows) :
            for c in range(self.columns) :
                cell = (r,c)
                if cell in self.walls :
                    symbol = '#'
                elif cell in self.terminals :
                    symbol = '+' if self.terminals[cell] > 0 else '-'
                elif cell == agent_state :
                    symbol = '@'
                else :
                    symbol = '.'
                print(symbol, end=' ')
            print("")


In [None]:
#This is a simple class to hold the policy dictionary
#useful for printing the policy and hiding some details.

class Policy:
    """Dictionary-backed policy: maps each state to the action chosen for it."""

    def __init__(self, grid_world=None):
        """Start with an empty mapping; grid_world is only needed by print()."""
        self.grid_world = grid_world
        self.policy = dict()  # state -> action

    def __getitem__(self, state):
        """Return the action stored for state (KeyError when none is stored)."""
        return self.policy[state]

    def __setitem__(self, state, action):
        """Store action as the choice for state."""
        self.policy[state] = action

    # Debugging aid: draws the policy on the grid, one text row per grid row.
    def print(self):
        """Print '+'/'-' for terminals, an arrow for mapped states, '#' otherwise."""
        arrows = {Up: '^', Down: 'v', Right: '>', Left: '<', Exit: '+'}
        world = self.grid_world
        for r in range(world.rows):
            for c in range(world.columns):
                state = (r, c)
                if state in world.terminals:
                    # sign of the terminal reward decides the symbol
                    symbol = '+' if world.terminals[state] >= 0 else '-'
                elif state in self.policy:
                    symbol = arrows[self.policy[state]]
                else:
                    symbol = '#'  # cells without a policy entry, i.e. walls
                print(symbol, end=' ')
                if c + 1 == world.columns:
                    print("")  # end of the grid row

In [None]:
# Shared grid world for the cells below; sizes, walls, terminals and noise keep their defaults.
gw = GridWorld(living_reward=-0.04, gamma=0.01)

## Value Iteration
* The value (utility) of a state s:
V*(s) = expected utility starting in s and acting optimally

In [None]:
# Every state of the grid world starts with value 0.
value_dict = dict.fromkeys(gw.all_states, 0)
def expected_value(gw, s, a, value_dict):
    '''One-step Bellman backup for taking action a in state s.

    Returns sum over next states s' of P(s'|s,a) * (R(s,a) + gamma * V(s')),
    where P comes from gw.transitions, R from gw.reward, gamma from gw.gamma
    and V from value_dict.
    '''
    next_states, _, probs = gw.transitions(s, a)
    reward = gw.reward(s, a)  # the same for every outcome, so look it up once
    return sum(
        p * (reward + gw.gamma * value_dict[next_state])
        for p, next_state in zip(probs, next_states)
    )
# Notebook spot check: evaluate the Exit action from cell (0,3) against the current value_dict.
expected_value(gw, (0,3), Exit, value_dict)

In [None]:
def value_iteration(gw, iterations=100):
    '''Run in-place value iteration and return the {state: value} dictionary.

    gw: the grid world; its all_states and actions(state) are used here.
    iterations: number of full sweeps over the states (default 100).

    Each sweep sets V(s) to the best expected_value over the actions
    available in s. Values updated earlier in a sweep are already visible to
    states updated later in the same sweep.
    '''
    value_dict = {state: 0 for state in gw.all_states}
    for _ in range(iterations):
        for state in gw.all_states:
            available = gw.actions(state)
            if not available:
                # nothing to maximise over (the end state): its value stays 0
                continue
            value_dict[state] = max(
                expected_value(gw, state, action, value_dict)
                for action in available
            )
    return value_dict

In [None]:
# Run value iteration on the shared grid world; the bare name lets the notebook echo the result.
V = value_iteration(gw)
V

In [None]:
#policy extraction
def policy_from_values(gw, value_dict, actions=None):
    '''Extract the greedy policy for the given state values.

    i.e. optimal action for each state that maximizes utility given a reward. 
    Replaces the optimal plan (sequence of actions) in deterministic search (= without probabilistic noise)

    gw: the grid world.
    value_dict: {state: value} used for the one-step look-ahead.
    actions: optional Policy to fill in; a fresh Policy(gw) is created when
        omitted, so separate calls never share one object.

    Returns the Policy. States with no available action (the end state) get
    no entry.
    '''
    if actions is None:
        actions = Policy(gw)
    for state in gw.all_states:
        available = gw.actions(state)
        if not available:
            continue
        # comparing (value, action) tuples breaks value ties by the larger action code
        _, best_action = max(
            (expected_value(gw, state, action, value_dict), action)
            for action in available
        )
        actions[state] = best_action
    return actions

In [None]:
# Follow the greedy policy from cell (2, 0) until gw.move reports the end state,
# printing every visited state and finally the summed reward.
a = policy_from_values(gw, V)
start_state = (2, 0)
rewards = 0
end = False
while not end:
    next_state, reward, end = gw.move(start_state, a[start_state])
    rewards, start_state = rewards + reward, next_state
    print(start_state)
print(rewards)

## Q-Value Iteration
To avoid relooping through all the states, implement q-value iteration (q-value = exp_value(s, a, V(s')))<br>
Very similar to value iteration<br>
* The value (utility) of a state s:
V*(s) = expected utility starting in s and acting optimally
* The value (utility) of a q-state (s,a):
Q*(s,a) = expected utility starting out having taken action a from state s and (thereafter) acting optimally


In [None]:
# One zero-initialised entry per (state, action) pair that gw allows.
q_value_dict = dict.fromkeys(
    ((state, action) for state in gw.all_states for action in gw.actions(state)),
    0,
)
def expected_q_value(gw, s, q_value_dict):
    '''One-step Q backup for every action available in state s.

    For each action a this computes
        sum over s' of P(s'|s,a) * (R(s,a) + gamma * max over a' of Q(s',a'))
    reading Q from q_value_dict. A next state with no available action
    contributes 0 as its best Q-value.

    Returns {action: backed-up q-value}. q_value_dict is only read, never
    modified.
    '''
    exp_q_value = {}
    for a in gw.actions(s):
        next_states, _, probs = gw.transitions(s, a)
        reward = gw.reward(s, a)
        total = 0
        for p, next_state in zip(probs, next_states):
            best_next = max(
                (q_value_dict[(next_state, a2)] for a2 in gw.actions(next_state)),
                default=0,
            )
            total += p * (reward + gw.gamma * best_next)
        exp_q_value[a] = total
    return exp_q_value
# Notebook spot check: print the per-action result for cell (2,0).
print(expected_q_value(gw, (2,0), q_value_dict))

In [None]:
#q-value iteration
def _q_backup(gw, state, action, q_values):
    '''Return sum over s' of P(s'|state,action) * (R + gamma * max over a' of Q(s',a')).'''
    next_states, _, probs = gw.transitions(state, action)
    reward = gw.reward(state, action)
    total = 0
    for p, next_state in zip(probs, next_states):
        # a next state without actions (the end state) is worth 0
        best_next = max(
            (q_values[(next_state, a2)] for a2 in gw.actions(next_state)),
            default=0,
        )
        total += p * (reward + gw.gamma * best_next)
    return total


def q_value_iteration(gw, iterations=100):
    '''Run in-place Q-value iteration and return {(state, action): q-value}.

    gw: the grid world.
    iterations: number of full sweeps over all (state, action) pairs
        (default 100).

    The table is created inside the function with every entry at 0, so
    repeated calls do not influence each other and no module-level dictionary
    is touched.
    '''
    q_values = {
        (state, action): 0
        for state in gw.all_states
        for action in gw.actions(state)
    }
    for _ in range(iterations):
        for state in gw.all_states:
            for action in gw.actions(state):
                q_values[(state, action)] = _q_backup(gw, state, action, q_values)
    return q_values

In [None]:
#policy extraction
# def policy_from_q_values(gw, q_value_dict, actions = Policy(gw)):
#     for state in gw.all_states:
#         if state != (-1,-1):
#             list_values = []
#             list_values.append(expected_q_value(gw, state, q_value_dict))
#             max_value, best_action = max(list_values)
#         actions[state] = best_action
#     return actions

## Policy Iteration

In [None]:
#Policy iteration: more efficient since policy can be figured much before the values converge
#1. Policy evaluation: calculate utilities for some fixed policy until convergence.
#2. Policy improvement: update policy using one-step look-ahead with resulting converged (but not optimal) 
#utilities as future values
#Repeat 1 and 2 until convergence
#policy_value(s) = expected total discounted rewards starting in s and following that policy

def values_from_policy(gw, actions, value_dict):
    '''Perform one in-place policy-evaluation sweep.

    gw: the grid world.
    actions: mapping state -> action (a Policy) for every state except the
        end state.
    value_dict: {state: value}; updated in place and also returned.

    For every state except gw.end_state the value is replaced by the
    expected_value of the action the policy prescribes there.
    '''
    for state in gw.all_states:
        if state == gw.end_state:
            continue  # no action is available from the end state
        value_dict[state] = expected_value(gw, state, actions[state], value_dict)
    return value_dict

In [None]:
# Initial policy: the first action gw offers in each state (the end state has none),
# followed by a single evaluation sweep starting from all-zero values.
actions = Policy(gw)
for state in gw.all_states:
    if state == (-1,-1):
        continue
    actions[state] = gw.actions(state)[0]
value_dict = dict.fromkeys(gw.all_states, 0)
values_from_policy(gw, actions, value_dict)

In [None]:
def policy_iteration(gw, max_improvements=100, eval_sweeps=5, tolerance=1e-9):
    '''Alternate policy evaluation and greedy improvement.

    gw: the grid world.
    max_improvements: upper bound on evaluation/improvement rounds.
    eval_sweeps: in-place evaluation sweeps per round.
    tolerance: largest value change in the last sweep of a round that still
        counts as converged.

    Starts from the policy that picks the first available action in every
    state. The loop stops early once a round leaves the greedy policy
    unchanged and its last evaluation sweep changed no value by more than
    tolerance; the round index and sweep count are then printed.

    Returns (policy, values).
    '''
    values = {state: 0 for state in gw.all_states}
    actions = Policy(gw)
    for state in gw.all_states:
        available = gw.actions(state)
        if available:
            actions[state] = available[0]

    for round_number in range(max_improvements):
        # 1. policy evaluation (truncated to eval_sweeps sweeps)
        previous = dict(values)
        for _ in range(eval_sweeps):
            previous = dict(values)
            values = values_from_policy(gw, actions, values)
        change = max((abs(values[s] - previous[s]) for s in values), default=0)

        # 2. policy improvement; a fresh Policy keeps the old one intact for comparison
        improved = policy_from_values(gw, values, Policy(gw))
        stable = all(improved[s] == actions[s] for s in actions.policy)
        actions = improved

        if stable and change <= tolerance:
            print(round_number, eval_sweeps)
            break
    return actions, values
# Run policy iteration on the shared grid world, draw the resulting policy,
# and leave the values as the last expression so the notebook echoes them.
c, values = policy_iteration(gw)
c.print()
values