In [1]:
import numpy as np
import gym

In [7]:
def value_iteration(env, gamma, stop_callback, max_iterations=int(1E9)):
    """Compute the optimal state values of a fully known MDP by value iteration.

    Value iteration is a Dynamic Programming method: the optimal value of a state
    with k time steps left is expressed through the optimal values with k-1 time
    steps left, so the values can be filled in bottom-up (tabulation), one sweep
    over all states per iteration.

    This is planning, not Reinforcement Learning: the transition and reward
    functions are read directly from ``env`` and nothing is learned from
    experience. The two practical weaknesses of the approach are that the whole
    model must be known and that every sweep touches every state and action,
    which becomes expensive for very large problems.

    Args:
        env: The Markov Decision Process. It must expose:
            * ``nS``: number of states (states are the integers ``0..nS-1``),
            * ``nA``: number of actions (actions are the integers ``0..nA-1``),
            * ``P[state][action]``: an iterable of 4-tuples
              ``(prob, next_state, reward, _)``. The fourth element is ignored
              here (presumably Gym's terminal flag -- confirm for your env).
        gamma: The discount factor.
        stop_callback: Callable ``stop_callback(k, V_prev, V)`` invoked after every
            sweep with the 1-based iteration index, the state values before the
            sweep and the state values after it. Return True to stop, False to
            continue. This lets the caller stop on a maximum number of iterations
            or when the change between sweeps is small enough to call converged.
            ``V`` is the live array that later sweeps overwrite in place; copy it
            if it has to be kept.
        max_iterations: Upper bound on the number of sweeps, so the algorithm
            cannot run forever if ``stop_callback`` never returns True. When the
            bound is reached the latest values are returned without any error
            or warning. A value of 0 or less performs no sweep at all.

    Returns:
        A numpy array of length ``env.nS`` holding the state values after the last
        sweep performed (all zeros if no sweep was performed).
    """
    # Helper: action value (Q value) of taking `action` in `state`, given the
    # state values V of the previous step. It is the expectation, over every
    # possible outcome, of the immediate reward plus the discounted value of
    # the state we land in.
    def action_value_function(state, action, V):
        return sum(
            prob * (reward + gamma * V[next_state])
            for prob, next_state, reward, _ in env.P[state][action]
        )

    # k = 0: no time steps left, the agent cannot act any more, so every
    # state is worth 0. This is the base case of the bottom-up recursion.
    V = np.zeros(env.nS)

    # Inclusive upper bound: exactly `max_iterations` sweeps are allowed.
    for k in range(1, max_iterations + 1):
        # V is overwritten in place below, so freeze the k-1 values first;
        # every update in this sweep must read the previous sweep's values.
        V_prev = np.copy(V)

        for state in range(env.nS):
            # The optimal value of a state is the best Q value over all actions.
            V[state] = max(
                action_value_function(state, action, V_prev)
                for action in range(env.nA)
            )

        # Let the caller decide whether we are done.
        if stop_callback(k, V_prev, V):
            break

    return V

In [6]:
# Sweeps are considered converged once the summed absolute change of all
# state values between two consecutive iterations drops to this tolerance.
CONVERGENCE_TOLERANCE = 1E-9

# Discount factor handed to value_iteration for this run.
GAMMA = 1.0


def stop_evaluator(k, V_prev, V):
    """Tell value_iteration to stop once the values have stopped changing.

    Args:
        k: Current iteration index (not used by this criterion).
        V_prev: State values before the latest sweep.
        V: State values after the latest sweep.

    Returns:
        True when the L1 distance between V_prev and V is at most
        CONVERGENCE_TOLERANCE, False otherwise.
    """
    total_change = np.fabs(V_prev - V).sum()
    return total_change <= CONVERGENCE_TOLERANCE


# .unwrapped strips the Gym wrappers so the attributes value_iteration reads
# (P, nS, nA) are accessed on the underlying environment object.
env = gym.make('FrozenLake-v0').unwrapped

optimal_state_values = value_iteration(env, GAMMA, stop_evaluator)
print(optimal_state_values)

env.close()

[ 0.82352941  0.82352941  0.82352941  0.8235294   0.82352941  0.
  0.52941176  0.          0.82352941  0.82352941  0.76470588  0.          0.
  0.88235294  0.94117647  0.        ]
