# HW 06

Implement value iteration on a Markov decision process (MDP).

In [7]:
import numpy as np
from typing import List
import copy

In [None]:
### DEFINITIONS

class State:
    """A grid cell identified by its x and y coordinates.

    Two states compare equal when both coordinates match, and the hash is
    derived from the same coordinate pair, so states can be stored in sets
    and used as dictionary keys. Do not change ``coord_x``/``coord_y`` while
    a state is held in a hashed container.
    """

    def __init__(self, x: int, y: int):
        """Create the state at ``(x, y)``.

        The coordinates are checked against the module-level grid size
        (``GRID_DIM_X`` / ``GRID_DIM_Y``); an ``AssertionError`` is raised
        when they fall outside of it.
        """
        assert x < GRID_DIM_X and x >= 0, "x out of bounds"
        assert y < GRID_DIM_Y and y >= 0, "y out of bounds"
        self.coord_x = x
        self.coord_y = y

    def __eq__(self, other):
        # Only another State can be equal; anything else compares unequal.
        if isinstance(other, State):
            return self.coord_x == other.coord_x \
                and self.coord_y == other.coord_y
        return False

    def __hash__(self):
        # Defining __eq__ alone makes a class unhashable, so provide a hash
        # that agrees with __eq__ (equal coordinates -> equal hash).
        return hash((self.coord_x, self.coord_y))

    def __str__(self):
        return f"State(x={self.coord_x}, y={self.coord_y})"

    def __repr__(self):
        return self.__str__()
    
class Action:
    """An action, encoded as an integer code.

    Codes:
        0 ... stay
        1 ... up
        2 ... right
        3 ... down
        4 ... left

    Actions compare equal when their codes match and hash consistently with
    that equality, so they can be stored in sets and used as dictionary keys.
    """

    def __init__(self, action: int):
        """Store the action code; raises ``AssertionError`` if not in 0..4."""
        assert action < 5 and action >= 0, "action out of bounds"
        self.action = action

    def __eq__(self, other):
        # Only another Action can be equal; anything else compares unequal.
        if isinstance(other, Action):
            return self.action == other.action
        return False

    def __hash__(self):
        # Defining __eq__ alone makes a class unhashable, so provide a hash
        # that agrees with __eq__ (equal code -> equal hash).
        return hash(self.action)

    def __repr__(self):
        return f"Action({self.action})"
    
class ValueFunction:
    """Tabular value function: one stored value per grid cell."""

    def __init__(self):
        # Zero-initialised table with GRID_DIM_X rows and GRID_DIM_Y columns,
        # addressed as table[state.coord_x, state.coord_y].
        self._storage = np.zeros(shape=(GRID_DIM_X, GRID_DIM_Y))

    @staticmethod
    def _cell(state):
        """Return the (x, y) index tuple that addresses ``state``'s entry."""
        return (state.coord_x, state.coord_y)

    def get_value(self, state: State) -> float:
        """Return the value currently stored for ``state``."""
        return self._storage[self._cell(state)]

    def set_value(self, state: State, value: float):
        """Overwrite the value stored for ``state`` with ``value``."""
        self._storage[self._cell(state)] = value
    
def reward_funciton(state: State):
    """Return the reward for occupying ``state``.

    The target state (``TARGET_STATE``) pays 10; every other state costs 1,
    i.e. yields -1.
    """
    at_target = TARGET_STATE == state
    return 10 if at_target else -1

def transition_probability(state_src: State, state_dst: State, action: Action):
    """Return the probability of reaching ``state_dst`` from ``state_src``.

    Transition model:
      * ``stay`` (code 0) keeps the agent in ``state_src`` with probability 1.
      * A movement action reaches the neighbouring cell in its direction with
        probability ``TRANSITION_PROB_P`` and leaves the agent in place with
        probability ``1 - TRANSITION_PROB_P``.
      * A movement action whose target cell lies outside the grid
        (``GRID_DIM_X`` x ``GRID_DIM_Y``) cannot succeed, so the agent stays
        in place with probability 1. Without this rule the probabilities over
        all valid destination states would only sum to
        ``1 - TRANSITION_PROB_P`` at the border.

    ``state_dst`` is expected to be a valid state; for every source state and
    action the returned values sum to 1 over all valid destinations.
    """
    # Displacement (dx, dy) that each action code attempts.
    displacement = {
        0: (0, 0),    # stay
        1: (0, 1),    # up
        2: (1, 0),    # right
        3: (0, -1),   # down
        4: (-1, 0),   # left
    }
    dx, dy = displacement[action.action]
    target_x = state_src.coord_x + dx
    target_y = state_src.coord_y + dy
    target_on_grid = (0 <= target_x < GRID_DIM_X
                      and 0 <= target_y < GRID_DIM_Y)
    stays_in_place = state_dst == state_src

    # Deliberate stay, or a move that would leave the grid: all probability
    # mass remains on the source state.
    if action.action == 0 or not target_on_grid:
        return 1 if stays_in_place else 0
    # Legal move that fails to execute.
    if stays_in_place:
        return 1 - TRANSITION_PROB_P
    # Legal move that succeeds.
    if state_dst.coord_x == target_x and state_dst.coord_y == target_y:
        return TRANSITION_PROB_P
    # Any other destination is unreachable in one step.
    return 0

def step_vi(
    state_evaluate: State,  # state being backed up
    set_states: List[State],  # all states
    set_actions: List[Action],  # all actions
    value_function_old: ValueFunction,  # values from the previous sweep
    value_function_new: ValueFunction,  # receives the updated value
):
    """Run the value-iteration maximisation step for one state.

    For each action the old values of all states are weighted by their
    transition probabilities from ``state_evaluate``, summed, discounted by
    ``DISCOUNT`` and added to the reward of ``state_evaluate``. The largest
    of these action values is stored in ``value_function_new``;
    ``value_function_old`` is only read.
    """

    def action_value(candidate):
        # Expected old value of the successor state under `candidate`.
        expected = 0
        for successor in set_states:
            probability = transition_probability(state_src=state_evaluate,
                                                 state_dst=successor,
                                                 action=candidate)
            expected += probability * value_function_old.get_value(successor)
        return reward_funciton(state_evaluate) + DISCOUNT * expected

    # Seeding with -inf keeps the result at -inf when there are no actions.
    candidates = [-np.inf] + [action_value(act) for act in set_actions]
    value_function_new.set_value(state_evaluate, max(candidates))

def value_iteration(
    value_function_init: ValueFunction,  # starting value function
    set_states: List[State],  # all possible states
    set_actions: List[Action],  # all possible actions
    epsilon: float = 0.42,  # convergence threshold
) -> ValueFunction:
    """Run value iteration and return the resulting value function.

    Each sweep backs up every state with ``step_vi`` and prints the largest
    absolute per-state change (the infinity norm of the difference between
    two consecutive value functions). Iteration stops as soon as that change
    is below ``epsilon``. ``value_function_init`` itself is never written to;
    the returned object is a separate copy.
    """
    previous = value_function_init
    current = copy.deepcopy(value_function_init)
    sweep = 0
    while True:
        largest_change = -np.inf
        for cell in set_states:
            step_vi(state_evaluate=cell,
                    set_states=set_states,
                    set_actions=set_actions,
                    value_function_old=previous,
                    value_function_new=current)
            change = np.abs(current.get_value(cell) - previous.get_value(cell))
            # Track the biggest change seen during this sweep.
            largest_change = max(largest_change, change)
        # Freeze this sweep's result as the reference for the next one.
        previous = copy.deepcopy(current)
        print(f"[{sweep}]\tLIninity Norm: {largest_change}")
        if largest_change < epsilon:
            return previous
        sweep += 1
            


### PARAMETERS

# Number of grid cells along each axis.
GRID_DIM_X = 4
GRID_DIM_Y = 4

# The cell whose reward is positive.
TARGET_STATE = State(x=3, y=3)

# Discount factor applied to the expected next-step value.
DISCOUNT = 0.95

# Probability that a movement action actually moves the agent.
TRANSITION_PROB_P = 0.7

### SCRIPT

# Every grid cell, enumerated with x as the outer and y as the inner index.
set_states = [
    State(x=x, y=y)
    for x in range(GRID_DIM_X)
    for y in range(GRID_DIM_Y)
]
# Every action code: 0 (stay) through 4 (left).
set_actions = [Action(code) for code in range(5)]

# Value iteration starts from the all-zero value function.
value_function = ValueFunction()

value_iteration(
    value_function_init=value_function,
    set_states=set_states,
    set_actions=set_actions,
    epsilon=0.5,
)

