In [1]:
import numpy as np
import gym

In [4]:
def value_iteration(env, gamma, max_k):
    """Compute finite-horizon optimal state values with value iteration.

    Starts from V_0 = 0 (with zero time steps left the agent cannot act, so
    every state is worth nothing) and applies the Bellman optimality backup
    bottom-up: the values for k steps left are derived from the values for
    k-1 steps left. Exactly ``max_k`` backups are performed, so the result
    holds the optimal state values when ``max_k`` time steps are left.

    Args:
        env: Environment exposing ``nS`` (number of states), ``nA`` (number of
            actions) and ``P``, where ``P[state][action]`` is an iterable of
            4-tuples ``(prob, next_state, reward, _)``; the fourth element is
            ignored here.
        gamma: Discount factor applied to the value of the next state.
        max_k: Horizon, i.e. the number of backups to perform. A value of 0
            (or less) performs no backup and returns all zeros.

    Returns:
        A float ``np.ndarray`` of length ``env.nS`` with the value of each
        state after ``max_k`` backups.
    """

    def action_value_function(state, action, V):
        # Expected return of taking `action` in `state`, given the optimal
        # state values `V` of the next (one step shorter) horizon.
        return sum(
            prob * (reward + gamma * V[next_state])
            for prob, next_state, reward, _ in env.P[state][action]
        )

    # k = 0: the game is over, every state is worth 0.
    V = np.zeros(env.nS)

    # One backup per remaining time step. Iterating `range(max_k)` yields
    # exactly `max_k` backups; the previous `range(1, max_k)` stopped one
    # step short of the requested horizon.
    for _ in range(max_k):
        V_prev = V
        # Build the new values from V_prev only, so updates made within this
        # sweep never leak into the same sweep. The value of a state is the
        # best action value available from it.
        V = np.array(
            [
                max(
                    action_value_function(state, action, V_prev)
                    for action in range(env.nA)
                )
                for state in range(env.nS)
            ],
            dtype=float,
        )

    return V

In [11]:
# Run undiscounted (gamma = 1.0) value iteration on FrozenLake and print the
# resulting state values. `.unwrapped` is used so that the attributes read by
# value_iteration (`P`, `nS`, `nA`) are looked up on the base environment.
env = gym.make('FrozenLake-v0').unwrapped
try:
    print(value_iteration(env, 1.0, 1000))
finally:
    # Always release the environment, even if value iteration raises.
    env.close()

[ 0.82352941  0.82352941  0.82352941  0.82352941  0.82352941  0.
  0.52941176  0.          0.82352941  0.82352941  0.76470588  0.          0.
  0.88235294  0.94117647  0.        ]
