In [14]:
import time

import matplotlib.pyplot as plt

from baserl.common import *
from baserl.grid_world import GridWorld

In [15]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

In [16]:
# Iterative Policy Evaluation, started from the random policy, to reproduce the results from the RL book.
#
# Observation: the greedy policy built from the current value estimate stops changing very
# early (within a couple of iterations), while the value estimates of the random policy being
# evaluated need roughly 88 iterations to converge to within `theta`.
# As noted in the book, a fully converged value function is not needed to reach a good/optimal policy.
mdp = GridWorld()
random_policy = make_random_policy(mdp.states(), mdp.actions)
# Backward-compatible alias for the previously misspelled name, in case other cells still use it.
ranom_policy = random_policy
v = iterative_policy_evaluation(
    random_policy,
    theta=0.001,
    states=mdp.states(),
    is_terminal=mdp.is_terminal,
    actions=mdp.actions,
    transitions=mdp.transitions,
    gamma=1.0,
    in_place=True,
    max_iter=100,
    print_value=mdp.print_value,
    print_policy=mdp.print_policy,
    # Progress (value function + greedy policy) is reported every 5 iterations in the output below.
    print_every_n=5,
)


Initial value function:
0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 

Initial greedy policy:
UDRL UDRL UDRL UDRL 
UDRL UDRL UDRL UDRL 
UDRL UDRL UDRL UDRL 
UDRL UDRL UDRL UDRL 

value function at iteration 5
0.00 -4.49 -6.30 -6.91 
-4.49 -6.26 -7.22 -7.37 
-6.30 -7.22 -7.19 -5.93 
-6.91 -7.37 -5.93 0.00 

greedy policy at iteration 5
  UL    L    L    L 
   U   UL    L    D 
   U    U   DR    D 
   U    R    R   DR 

iterative_policy_evaluation: num iter= 5 delta= 1.3317079544067383
value function at iteration 10
0.00 -7.83 -11.12 -12.23 
-7.83 -10.42 -11.77 -11.86 
-11.12 -11.77 -11.05 -8.81 
-12.23 -11.86 -8.81 0.00 

greedy policy at iteration 10
  UL    L    L    L 
   U   UL    L    D 
   U    U   DR    D 
   U    R    R   DR 

iterative_policy_evaluation: num iter= 10 delta= 0.8924059502996897
value function at iteration 15
0.00 -10.01 -14.27 -15.69 
-10.01 -13.11 -14.69 -14.75 
-14.27 -14.69 -13.52 -10.65 
-15.69 -14.75 -10.65 0.00 

greedy

In [17]:
# Let's also run Value Iteration on the same grid world - this is not required in the book.
mdp = GridWorld()
# Time the solve with perf_counter(): it is the clock meant for measuring short elapsed
# intervals, whereas time.time() tracks the wall clock and can jump if the system time changes.
start_time = time.perf_counter()
mdp_policy, mdp_v = value_iteration(
    states=mdp.states(),
    is_terminal=mdp.is_terminal,
    actions=mdp.actions,
    transitions=mdp.transitions,
    gamma=mdp.gamma(),
    delta_threshold=0.001,
    max_iter=100,
    print_value=mdp.print_value,
    print_policy=mdp.print_policy,
)
# Elapsed time in seconds.
print("Done in time:", time.perf_counter() - start_time)

delta at iteration: 1 1.0
delta at iteration: 2 1.0
delta at iteration: 3 1.0
delta at iteration: 4 0
value function at iteration 4
0.00 -1.00 -2.00 -3.00 
-1.00 -2.00 -3.00 -2.00 
-2.00 -3.00 -2.00 -1.00 
-3.00 -2.00 -1.00 0.00 

policy:
  UL    L    L   DL 
   U   UL UDRL    D 
   U UDRL   DR    D 
  UR    R    R   DR 

Done in time: 0.0015058517456054688
