In [4]:
import numpy as np
import gym
import time
from lake_envs import *

np.set_printoptions(precision=3)

In [5]:
def render_single(env, policy, max_steps=100):
  """
    This function does not need to be modified
    Renders policy once on environment. Watch your agent play!

    Parameters
    ----------
    env: gym.core.Environment
      Environment to play on. Must have nS, nA, and P as
      attributes.
    Policy: np.array of shape [env.nS]
      The action to take at a given state
  """

  episode_reward = 0
  ob = env.reset()
  for t in range(max_steps):
    env.render()
    time.sleep(0.25)
    a = policy[ob]
    ob, rew, done, _ = env.step(a)
    episode_reward += rew
    if done:
      break
  env.render();
  if not done:
    print("The agent didn't reach a terminal state in {} steps.".format(max_steps))
  else:
    print('# Max Steps: ',max_steps)
    print("Episode reward: %f" % episode_reward)

In [6]:
# Inspect the deterministic environment
env_d = gym.make("Deterministic-4x4-FrozenLake-v0")
print('probability, nextstate, reward, terminal')
env_d.P[0]

probability, nextstate, reward, terminal


{0: [(1.0, 0, 0.0, False)],
 1: [(1.0, 4, 0.0, False)],
 2: [(1.0, 1, 0.0, False)],
 3: [(1.0, 0, 0.0, False)]}

In [7]:
# Inspect the stochastic environment
env_s = gym.make("Stochastic-4x4-FrozenLake-v0")
print('probability, nextstate, reward, terminal')
env_s.P[0]

probability, nextstate, reward, terminal


{0: [(0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 4, 0.0, False)],
 1: [(0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 4, 0.0, False),
  (0.3333333333333333, 1, 0.0, False)],
 2: [(0.3333333333333333, 4, 0.0, False),
  (0.3333333333333333, 1, 0.0, False),
  (0.3333333333333333, 0, 0.0, False)],
 3: [(0.3333333333333333, 1, 0.0, False),
  (0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 0, 0.0, False)]}

In [8]:
def policy_evaluation(P, nS, nA, policy, gamma=1, tol=1e-3, run_num_episodes=5):
        
    # Initialize value function & delta
    value_function = np.zeros(nS)
    delta = np.inf
    episode = 0
    
    print('SANITY CHECK')
    print('Env. Action Probability:',P[0][0])
    print('Initial Value Function',value_function)
    print('Policy',policy)
    
    # Policy eval. will terminate when the value function's change is below the threshold   
    while episode < run_num_episodes:
    # while delta >= tol:    
        
        print('\nEpisode:',episode, ' &  Value Function',value_function)
        
        # Why do we loop through all (16) states? 
        for s in range(nS):   
            v = value_function[s]
            a = policy[s]
            
            for parameter in range(len(P[s][a])):
            
                prob = P[s][a][parameter][0]  
                nextstate = P[s][a][parameter][1]
                reward = P[s][a][parameter][2]
                done = P[s][a][parameter][3]
                
                print('\nstate:',s)
                print('action:',policy[s])
                print('P:',P[s][a])
                print('prob:',prob)
                print('value nextstate:',value_function[nextstate])
                print('reward:',reward)
                print('done:',done)
                
                state_value = prob * (reward + gamma * value_function[nextstate])
                print('State Value:',state_value)
                
                value_function[s] += prob * (reward + gamma * value_function[nextstate]) 
                print('value_function[s]:',value_function[s])
                                       
                # Compute the change in value functions across states
                delta = max(delta, np.abs(v - value_function[s]))

        episode += 1
        
        """
        if episode % 10 == 0:
            print('Episode: ',episode)
            print('value_function: ',V)
        """
        
    # Final value function
    print('Final # of Episodes: ',episode)
        
    return value_function

# Random Deterministic Policy

In [9]:
rpolicy = np.random.choice(env_d.nA, env_d.nS)
rpolicy[14] = 2
print('\nPolicy: ',rpolicy)


Policy:  [0 0 2 1 2 0 3 3 1 0 1 2 3 1 2 0]


In [10]:
# Evaluate a Stochastic Zeros Policy 
print("\n" + "-"*31 + "\nBeginning Random Policy Iteration\n" + "-"*31)
state_values = policy_evaluation(env_d.P, env_d.nS, env_d.nA, rpolicy, gamma=1, tol=1e-3)

# Examine a Stochastic Zeros Policy & Values
print('\nPolicy: ',rpolicy)
print('Values: ',state_values)


-------------------------------
Beginning Random Policy Iteration
-------------------------------
SANITY CHECK
Env. Action Probability: [(1.0, 0, 0.0, False)]
Initial Value Function [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Policy [0 0 2 1 2 0 3 3 1 0 1 2 3 1 2 0]

Episode: 0  &  Value Function [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

state: 0
action: 0
P: [(1.0, 0, 0.0, False)]
prob: 1.0
value nextstate: 0.0
reward: 0.0
done: False
State Value: 0.0
value_function[s]: 0.0

state: 1
action: 0
P: [(1.0, 0, 0.0, False)]
prob: 1.0
value nextstate: 0.0
reward: 0.0
done: False
State Value: 0.0
value_function[s]: 0.0

state: 2
action: 2
P: [(1.0, 3, 0.0, False)]
prob: 1.0
value nextstate: 0.0
reward: 0.0
done: False
State Value: 0.0
value_function[s]: 0.0

state: 3
action: 1
P: [(1.0, 7, 0.0, True)]
prob: 1.0
value nextstate: 0.0
reward: 0.0
done: True
State Value: 0.0
value_function[s]: 0.0

state: 4
action: 2
P: [(1.0, 5, 0.0, True)]
prob: 1.0
value nextstate: 0.0
reward:

In [25]:
render_single(env_d, policy, max_steps=50)

NameError: name 'policy' is not defined

# Random Stochastic Policy

In [5]:
policy = np.random.choice(env_s.nA, env_s.nS)
print('\nPolicy: ',policy)


Policy:  [1 3 3 3 0 3 3 0 0 2 1 3 1 2 3 3]


In [None]:
# Evaluate a Stochastic Zeros Policy 
print("\n" + "-"*31 + "\nBeginning Zeros Policy Iteration\n" + "-"*31)
state_values = policy_evaluation(env_s.P, env_s.nS, env_s.nA, policy, gamma=1, tol=1e-3)

# Examine a Stochastic Zeros Policy & Values
print('\nPolicy: ',policy)
print('Values: ',state_values)

In [None]:
# Inspect Behavior for a Stochastic Zeros Policy
render_single(env_s, policy, max_steps=5)

# Policy Evaluation

In [12]:
## Policy Evaluation 
"""
                    # ------------ Deviation from 4.1 algorithm ------------ #
                    # Loop through set of possible next actions
                    
                    for a, action_prob in enumerate(policy[s]):
                        # For each action, look at its possible next state
                        for prob, next_state, reward, done in P[s][a]:

                            # Calculate the expected value using equation 4.6
                            v += action_prob * prob * (reward + gamma * ) 
                            
                    # ------------ Deviation from 4.1 algorithm ------------ #
                    """

'\n                    # ------------ Deviation from 4.1 algorithm ------------ #\n                    # Loop through set of possible next actions\n                    \n                    for a, action_prob in enumerate(policy[s]):\n                        # For each action, look at its possible next state\n                        for prob, next_state, reward, done in P[s][a]:\n\n                            # Calculate the expected value using equation 4.6\n                            v += action_prob * prob * (reward + gamma * ) \n                            \n                    # ------------ Deviation from 4.1 algorithm ------------ #\n                    '