In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

In [2]:
import random
import sys
import time

In [3]:
from baserl.common import *
from baserl.grid_world import GridWorld
from baserl.jacks_rental import JacksRental

In [4]:
# Seed both the stdlib and the NumPy global random number generators with the
# same value so that re-running the notebook from a fresh kernel gives the same
# random draws.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [5]:
# Iterative Policy Evaluation on the Grid World problem, run twice: first
# updating V in place, then out of place (the new V is computed from a copy of
# the old one).
# As mentioned in the book, the in-place version converges in fewer iterations
# (for instance 114 vs 173).
# Oddly, the results in fig 4.1 on page 62 are reproduced by the out-of-place
# version, even though the book says those results are for the in-place one.
grid_world = GridWorld()
for in_place in (True, False):
    # A fresh random policy is built for each run, exactly as part of the call.
    V = iterative_policy_evaluation(
        states=grid_world.states(),
        is_terminal=grid_world.is_terminal,
        actions=grid_world.actions,
        transitions=grid_world.transitions,
        policy=make_random_policy(grid_world.states(), grid_world.actions),
        gamma=grid_world.gamma(),
        theta=0.0001,
        in_place=in_place,
        print_value=grid_world.print_value,
        print_policy=grid_world.print_policy,
    )
    # Summarise the run with the mean of the returned state values.
    print('in_place:', in_place, 'avg value:', np.mean(list(V.values())))
    print()

Initial value function:
0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 

Initial greedy policy:
UDRL UDRL UDRL UDRL 
UDRL UDRL UDRL UDRL 
UDRL UDRL UDRL UDRL 
UDRL UDRL UDRL UDRL 

value function at iteration 1
0.00 -1.00 -1.25 -1.31 
-1.00 -1.50 -1.69 -1.75 
-1.25 -1.69 -1.84 -1.90 
-1.31 -1.75 -1.90 0.00 

greedy policy at iteration 1
  UL    L    L    L 
   U   UL    U    U 
   U    L   UL    D 
   U    L    R   DR 

value function at iteration 2
0.00 -1.94 -2.55 -2.73 
-1.94 -2.81 -3.24 -3.40 
-2.55 -3.24 -3.57 -3.22 
-2.73 -3.40 -3.22 0.00 

greedy policy at iteration 2
  UL    L    L    L 
   U   UL    U    U 
   U    L   DR    D 
   U    L    R   DR 

value function at iteration 3
0.00 -2.82 -3.83 -4.18 
-2.82 -4.03 -4.71 -4.88 
-3.83 -4.71 -4.96 -4.26 
-4.18 -4.88 -4.26 0.00 

greedy policy at iteration 3
  UL    L    L    L 
   U   UL    U    U 
   U    L   DR    D 
   U    L    R   DR 

value function at iteration 4
0.00 -3.67 -5.10 -5.58 


value function at iteration 59
0.00 -13.92 -19.88 -21.87 
-13.92 -17.90 -19.89 -19.89 
-19.88 -19.89 -17.90 -13.93 
-21.87 -19.89 -13.93 0.00 

greedy policy at iteration 59
  UL    L    L    L 
   U   UL    L    D 
   U    U   DR    D 
   U    R    R   DR 

value function at iteration 60
0.00 -13.92 -19.89 -21.88 
-13.92 -17.90 -19.90 -19.90 
-19.89 -19.90 -17.91 -13.93 
-21.88 -19.90 -13.93 0.00 

greedy policy at iteration 60
  UL    L    L    L 
   U   UL    L    D 
   U    U   DR    D 
   U    R    R   DR 

value function at iteration 61
0.00 -13.93 -19.90 -21.89 
-13.93 -17.91 -19.91 -19.91 
-19.90 -19.91 -17.92 -13.94 
-21.89 -19.91 -13.94 0.00 

greedy policy at iteration 61
  UL    L    L    L 
   U   UL    L    D 
   U    U   DR    D 
   U    R    R   DR 

value function at iteration 62
0.00 -13.93 -19.91 -21.90 
-13.93 -17.92 -19.91 -19.91 
-19.91 -19.91 -17.93 -13.95 
-21.90 -19.91 -13.95 0.00 

greedy policy at iteration 62
  UL    L    L    L 
   U   UL    L    D 
   U   

   U   UL   DL    D 
   U   UR   DR    D 
  UR    R    R   DR 

value function at iteration 36
0.00 -12.10 -17.19 -18.85 
-12.10 -15.52 -17.21 -17.19 
-17.19 -17.21 -15.52 -12.10 
-18.85 -17.19 -12.10 0.00 

greedy policy at iteration 36
  UL    L    L   DL 
   U   UL   DL    D 
   U   UR   DR    D 
  UR    R    R   DR 

value function at iteration 37
0.00 -12.20 -17.34 -19.02 
-12.20 -15.65 -17.36 -17.34 
-17.34 -17.36 -15.65 -12.20 
-19.02 -17.34 -12.20 0.00 

greedy policy at iteration 37
  UL    L    L   DL 
   U   UL   DL    D 
   U   UR   DR    D 
  UR    R    R   DR 

value function at iteration 38
0.00 -12.30 -17.48 -19.18 
-12.30 -15.78 -17.50 -17.48 
-17.48 -17.50 -15.78 -12.30 
-19.18 -17.48 -12.30 0.00 

greedy policy at iteration 38
  UL    L    L   DL 
   U   UL   DL    D 
   U   UR   DR    D 
  UR    R    R   DR 

value function at iteration 39
0.00 -12.39 -17.61 -19.33 
-12.39 -15.90 -17.63 -17.61 
-17.61 -17.63 -15.90 -12.39 
-19.33 -17.61 -12.39 0.00 

greedy policy a

  UL    L    L   DL 
   U   UL   DL    D 
   U   UR   DR    D 
  UR    R    R   DR 

value function at iteration 123
0.00 -13.98 -19.98 -21.97 
-13.98 -17.98 -19.98 -19.98 
-19.98 -19.98 -17.98 -13.98 
-21.97 -19.98 -13.98 0.00 

greedy policy at iteration 123
  UL    L    L   DL 
   U   UL   DL    D 
   U   UR   DR    D 
  UR    R    R   DR 

value function at iteration 124
0.00 -13.98 -19.98 -21.97 
-13.98 -17.98 -19.98 -19.98 
-19.98 -19.98 -17.98 -13.98 
-21.97 -19.98 -13.98 0.00 

greedy policy at iteration 124
  UL    L    L   DL 
   U   UL   DL    D 
   U   UR   DR    D 
  UR    R    R   DR 

value function at iteration 125
0.00 -13.99 -19.98 -21.98 
-13.99 -17.98 -19.98 -19.98 
-19.98 -19.98 -17.98 -13.99 
-21.98 -19.98 -13.99 0.00 

greedy policy at iteration 125
  UL    L    L   DL 
   U   UL   DL    D 
   U   UR   DR    D 
  UR    R    R   DR 

value function at iteration 126
0.00 -13.99 -19.98 -21.98 
-13.99 -17.98 -19.98 -19.98 
-19.98 -19.98 -17.98 -13.99 
-21.98 -19.98 -

In [6]:
# Iterative Policy Evaluation on the Jack's Rental problem, run in place and
# then out of place.
#   in-place:     68 iterations
#   out-of-place: 121 iterations
# Both converged to the same policy and value; the string below records that
# result for reference (it is a bare expression and has no effect when run).
"""
0 0 0 0 0 -1 -1 -2 -2 -2 
0 0 0 0 0 0 -1 -1 -1 -2 
0 0 0 0 0 0 0 0 -1 -1 
0 0 0 0 0 0 0 0 0 0 
1 1 0 0 0 0 0 0 0 0 
2 1 1 0 0 0 0 0 0 0 
2 2 1 1 0 0 0 0 0 0 
3 2 2 1 1 0 0 0 0 0 
3 3 2 2 1 1 0 0 0 0 
4 3 3 2 2 1 1 0 0 0 
avg value: 422.44...
"""
jacks_rental = JacksRental()
for in_place in (True, False):
    # A fresh random policy is built for each run, exactly as part of the call.
    V = iterative_policy_evaluation(
        states=jacks_rental.states(),
        is_terminal=jacks_rental.is_terminal,
        actions=jacks_rental.actions,
        transitions=jacks_rental.transitions,
        policy=make_random_policy(jacks_rental.states(), jacks_rental.actions),
        gamma=jacks_rental.gamma(),
        theta=0.0001,
        in_place=in_place,
        print_value=jacks_rental.print_value,
        print_policy=jacks_rental.print_policy,
    )
    # Summarise the run with the mean of the returned state values.
    print('in_place:', in_place, 'avg value:', np.mean(list(V.values())))
    print()

Initial value function:
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 

Initial greedy policy:
 0  0  0  0 -1 -2 -2 -3 -3 -3 
 0  0  0  0 -1 -1 -2 -2 -2 -3 
 0  0  0  0  0 -1 -1 -1 -2 -2 
 1  1  0  0  0  0  0 -1 -1 -1 
 2  1  1  1  0  0  0  0  0  0 
 2  2  2  1  0  0  0  0  0  0 
 3  3  2  1  1  0  0  0  0  0 
 4  3  2  2  1  0  0  0  0  0 
 4  3  3  2  1  1  0  0  0  0 
 4  4  3  2  2  1  0  0  0  0 

value function at iteration 1
0.00 8.63 16.61 23.54 29.15 33.45 39.85 44.80 48.34 50.66 
9.31 17.99 25.57 31.80 36.49 39.86 45.73 5

288.49 297.77 306.50 314.37 321.38 327.69 335.34 342.27 348.07 352.82 
299.95 308.57 316.59 323.83 330.26 336.00 343.23 349.37 354.47 358.61 
310.00 318.04 325.47 332.15 338.04 343.20 349.67 355.12 359.61 363.24 
319.79 327.36 334.27 340.40 345.73 350.14 356.47 361.77 366.11 369.59 
328.44 335.54 341.92 347.50 352.10 355.91 362.11 367.27 371.50 374.89 
336.16 342.75 348.57 353.46 357.51 360.86 366.94 372.00 376.15 379.49 
342.98 349.01 354.18 358.56 362.22 365.24 371.22 376.21 380.30 383.61 

greedy policy at iteration 8
 0  0  0  0  0 -1 -1 -2 -3 -3 
 0  0  0  0  0  0 -1 -2 -2 -2 
 0  0  0  0  0  0 -1 -1 -1 -2 
 0  0  0  0  0  0  0  0 -1 -1 
 0  0  0  0  0  0  0  0  0  0 
 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 

value function at iteration 9
273.19 282.21 291.18 299.85 308.08 315.84 324.53 332.60 339.99 346.68 
282.90 292.59 301.88 310.52 318.44 325.73 334.08 341.78 348.7

 0  0  0  0  0  0  0  0  0  0 
 1  0  0  0  0  0  0  0  0  0 
 1  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  0  0  0  0  0  0 
 3  3  2  1  1  0  0  0  0  0 
 4  3  2  2  1  1  0  0  0  0 

value function at iteration 16
334.43 343.31 351.97 360.23 367.97 375.18 383.72 391.61 398.80 405.26 
343.48 352.81 361.47 369.46 376.81 383.55 391.73 399.25 406.05 412.09 
352.61 361.87 370.28 377.91 384.84 391.15 398.96 406.10 412.48 417.64 
361.52 370.41 378.41 385.62 392.12 398.00 405.43 412.16 417.71 422.16 
369.86 378.27 385.81 392.59 398.67 404.09 411.13 417.06 421.92 425.78 
377.47 385.40 392.50 398.85 404.48 409.44 415.72 420.99 425.26 428.63 
385.59 393.07 399.70 405.56 410.69 414.98 421.14 426.26 430.39 433.64 
392.80 399.81 405.96 411.31 415.77 419.48 425.50 430.48 434.50 437.67 
399.10 405.63 411.27 415.94 419.83 423.07 428.96 433.83 437.76 440.86 
404.51 410.55 415.50 419.62 423.05 425.90 431.67 436.46 440.33 443.40 

greedy policy at iteration 16
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  0  0  0  0  0 
 4  3  3  2  1  1  1  0  0  0 

value function at iteration 24
349.69 358.55 367.13 375.28 382.91 389.97 398.48 406.33 413.46 419.86 
358.58 367.83 376.33 384.16 391.36 397.97 406.10 413.58 420.33 426.32 
367.25 376.40 384.64 392.11 398.90 405.10 412.87 419.96 426.29 431.41 
375.53 384.34 392.20 399.28 405.69 411.48 418.87 425.55 431.06 435.45 
383.27 391.64 399.09 405.78 411.78 417.15 424.15 430.04 434.85 438.66 
390.41 398.32 405.36 411.64 417.22 422.14 428.39 433.61 437.85 441.17 
398.21 405.67 412.25 418.06 423.15 427.41 433.53 438.62 442.71 445.92 
405.14 412.14 418.24 423.55 427.97 431.67 437.65 442.60 446.58 449.70 
411.17 417.69 423.29 427.92 431.78 435.00 440.85 445.68 449.57 452.63 
416.31 422.35 427.26 431.32 434.71 437.53 443.26 448.01 451.84 454.86 

greedy policy at iteration 24
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 32
352.62 361.47 370.04 378.17 385.77 392.81 401.31 409.15 416.27 422.66 
361.48 370.71 379.18 386.98 394.15 400.73 408.86 416.33 423.06 429.05 
370.06 379.19 387.39 394.83 401.60 407.77 415.54 422.62 428.94 434.04 
378.22 387.01 394.85 401.90 408.29 414.06 421.45 428.12 433.62 438.00 
385.84 394.20 401.63 408.30 414.30 419.65 426.64 432.53 437.33 441.13 
392.89 400.80 407.82 414.09 419.67 424.57 430.82 436.04 440.26 443.58 
400.63 408.09 414.65 420.45 425.53 429.80 435.91 440.99 445.08 448.28 
407.50 414.50 420.59 425.89 430.31 434.01 439.98 444.93 448.90 452.01 
413.48 420.00 425.60 430.22 434.07 437.29 443.13 447.95 451.83 454.88 
418.57 424.61 429.51 433.57 436.95 439.76 445.49 450.22 454.04 457.06 

greedy policy at iteration 32
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 40
353.18 362.03 370.60 378.72 386.32 393.35 401.85 409.69 416.81 423.20 
362.03 371.26 379.72 387.52 394.68 401.26 409.39 416.85 423.59 429.57 
370.60 379.73 387.92 395.35 402.12 408.28 416.05 423.13 429.45 434.55 
378.73 387.52 395.35 402.40 408.79 414.56 421.95 428.62 434.11 438.49 
386.33 394.69 402.12 408.79 414.78 420.13 427.12 433.01 437.81 441.60 
393.37 401.27 408.29 414.56 420.14 425.04 431.28 436.50 440.72 444.04 
401.09 408.55 415.12 420.91 425.99 430.25 436.37 441.44 445.53 448.73 
407.96 414.95 421.04 426.34 430.76 434.46 440.43 445.37 449.34 452.45 
413.92 420.45 426.04 430.66 434.51 437.73 443.56 448.39 452.27 455.32 
419.01 425.05 429.95 434.00 437.37 440.19 445.91 450.65 454.47 457.48 

greedy policy at iteration 40
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 48
353.29 362.14 370.70 378.83 386.42 393.46 401.95 409.79 416.91 423.30 
362.14 371.37 379.83 387.62 394.78 401.36 409.49 416.95 423.69 429.67 
370.70 379.83 388.02 395.45 402.22 408.38 416.15 423.23 429.55 434.65 
378.83 387.62 395.45 402.50 408.88 414.65 422.04 428.71 434.20 438.58 
386.43 394.79 402.22 408.88 414.87 420.22 427.21 433.10 437.90 441.69 
393.46 401.36 408.38 414.65 420.23 425.13 431.37 436.59 440.81 444.12 
401.18 408.64 415.20 421.00 426.08 430.34 436.45 441.53 445.62 448.81 
408.04 415.04 421.13 426.43 430.85 434.54 440.52 445.46 449.43 452.53 
414.01 420.53 426.13 430.75 434.60 437.81 443.65 448.47 452.35 455.40 
419.09 425.13 430.03 434.08 437.46 440.27 445.99 450.73 454.55 457.56 

greedy policy at iteration 48
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 56
353.31 362.16 370.72 378.85 386.44 393.48 401.97 409.81 416.93 423.32 
362.16 371.39 379.85 387.64 394.80 401.38 409.51 416.97 423.71 429.69 
370.72 379.85 388.04 395.47 402.23 408.40 416.17 423.25 429.57 434.67 
378.85 387.64 395.47 402.52 408.90 414.67 422.06 428.73 434.22 438.60 
386.45 394.80 402.23 408.90 414.89 420.24 427.23 433.12 437.91 441.71 
393.48 401.38 408.40 414.67 420.24 425.14 431.39 436.61 440.83 444.14 
401.20 408.65 415.22 421.02 426.10 430.36 436.47 441.55 445.63 448.83 
408.06 415.06 421.15 426.44 430.87 434.56 440.53 445.47 449.44 452.55 
414.02 420.55 426.14 430.76 434.61 437.83 443.66 448.49 452.37 455.41 
419.11 425.14 430.04 434.09 437.47 440.29 446.01 450.74 454.56 457.57 

greedy policy at iteration 56
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 64
353.32 362.16 370.73 378.86 386.45 393.48 401.98 409.81 416.93 423.33 
362.16 371.39 379.85 387.64 394.81 401.38 409.51 416.98 423.71 429.69 
370.73 379.85 388.04 395.47 402.24 408.41 416.17 423.25 429.57 434.67 
378.86 387.64 395.47 402.52 408.90 414.67 422.06 428.73 434.22 438.60 
386.45 394.81 402.24 408.90 414.89 420.25 427.24 433.12 437.92 441.71 
393.48 401.38 408.41 414.67 420.25 425.15 431.39 436.61 440.83 444.14 
401.20 408.66 415.22 421.02 426.10 430.36 436.47 441.55 445.64 448.83 
408.06 415.06 421.15 426.45 430.87 434.56 440.54 445.48 449.45 452.55 
414.03 420.55 426.15 430.76 434.62 437.83 443.67 448.49 452.37 455.42 
419.11 425.15 430.05 434.10 437.48 440.29 446.01 450.75 454.57 457.58 

greedy policy at iteration 64
 0  0  0  

 1  1  0  0  0  0  0  0  0 -1 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  0  0  0  0  0 
 4  3  3  2  1  1  0  0  0  0 
 4  4  3  2  2  1  0  0  0  0 

value function at iteration 2
29.13 37.89 46.18 53.73 60.39 66.14 73.62 80.03 85.33 89.60 
37.89 46.84 54.72 61.59 67.47 72.46 79.36 85.23 90.04 93.89 
46.18 54.72 61.98 68.13 73.31 77.66 84.02 89.40 93.77 97.03 
53.73 61.59 68.13 73.59 78.16 81.97 87.83 92.74 96.53 99.33 
60.39 67.47 73.31 78.16 82.19 85.52 90.92 95.25 98.56 101.00 
66.14 72.46 77.66 81.97 85.52 88.42 93.25 97.10 100.04 102.18 
72.48 78.19 82.83 86.62 89.70 92.15 96.89 100.66 103.53 105.63 
77.57 82.73 86.87 90.21 92.85 94.92 99.59 103.29 106.11 108.18 
81.45 86.14 89.87 92.79 95.07 96.86 101.46 105.12 107.91 109.95 
84.32 88.62 91.95 94.55 96.58 98.17 102.72 106.34 109.11 111.14 

greedy policy at iteration 2
 0  0  0  0  0 -1 -2 -2 -2 -3 
 0  0  0  0  0 -1 -1 -1 -2 -2 
 0  0  0  0  0  0  0 -1 -1 -1 
 1

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 11
228.20 237.04 245.61 253.73 261.32 268.35 276.85 284.68 291.79 298.18 
237.04 246.27 254.73 262.52 269.68 276.25 284.38 291.83 298.56 304.54 
245.61 254.73 262.92 270.34 277.10 283.27 291.03 298.10 304.41 309.51 
253.73 262.52 270.34 277.39 283.76 289.53 296.91 303.57 309.06 313.44 
261.32 269.68 277.10 283.76 289.75 295.09 302.08 307.95 312.75 316.54 
268.35 276.25 283.27 289.53 295.09 299.99 306.23 311.44 315.66 318.96 
276.07 283.52 290.08 295.87 300.94 305.19 311.30 316.37 320.45 323.65 
282.92 289.92 296.00 301.29 305.70 309.39 315.36 320.29 324.26 327.36 
288.88 295.40 300.98 305.60 309.44 312.65 318.48 323.30 327.18 330.22 
293.95 299.99 304.88 308.92 312.30 315.11 320.82 325.56 329.37 332.38 

greedy policy at iteration 11
 0  0  0  

318.32 327.44 335.63 343.06 349.83 355.99 363.76 370.84 377.16 382.26 
326.44 335.23 343.06 350.11 356.49 362.26 369.65 376.32 381.81 386.19 
334.04 342.40 349.83 356.49 362.48 367.83 374.82 380.71 385.51 389.30 
341.07 348.97 355.99 362.26 367.83 372.74 378.98 384.20 388.42 391.73 
348.79 356.25 362.81 368.61 373.69 377.95 384.06 389.14 393.22 396.42 
355.65 362.65 368.74 374.04 378.46 382.15 388.12 393.07 397.03 400.14 
361.62 368.14 373.73 378.35 382.21 385.42 391.25 396.08 399.96 403.01 
366.70 372.74 377.64 381.69 385.06 387.88 393.60 398.34 402.15 405.17 

greedy policy at iteration 19
 0  0  0  0  0 -1 -1 -2 -2 -2 
 0  0  0  0  0  0 -1 -1 -1 -2 
 0  0  0  0  0  0  0  0 -1 -1 
 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 20
306.31 315.15 323.72 331.85 339.44 346.47 354.97 362.81 369

356.91 365.70 373.52 380.57 386.95 392.72 400.11 406.78 412.27 416.66 
364.50 372.86 380.29 386.95 392.95 398.30 405.29 411.17 415.97 419.76 
371.53 379.43 386.46 392.72 398.30 403.20 409.44 414.66 418.88 422.20 
379.25 386.71 393.28 399.07 404.15 408.41 414.53 419.60 423.69 426.88 
386.11 393.11 399.20 404.50 408.92 412.61 418.59 423.53 427.50 430.60 
392.08 398.60 404.20 408.82 412.67 415.88 421.72 426.54 430.42 433.47 
397.16 403.20 408.10 412.15 415.53 418.34 424.06 428.80 432.62 435.63 

greedy policy at iteration 27
 0  0  0  0  0 -1 -1 -2 -2 -2 
 0  0  0  0  0  0 -1 -1 -1 -2 
 0  0  0  0  0  0  0  0 -1 -1 
 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 28
333.63 342.48 351.04 359.17 366.76 373.79 382.29 390.13 397.25 403.64 
342.48 351.71 360.17 367.96 375.12 381.70 389.83 397.29 404

361.54 370.66 378.85 386.28 393.05 399.21 406.98 414.06 420.38 425.48 
369.66 378.45 386.28 393.33 399.71 405.48 412.87 419.54 425.03 429.41 
377.26 385.62 393.05 399.71 405.70 411.05 418.04 423.93 428.73 432.52 
384.29 392.19 399.21 405.48 411.05 415.96 422.20 427.42 431.64 434.95 
392.01 399.47 406.03 411.83 416.91 421.17 427.28 432.36 436.44 439.64 
398.87 405.87 411.96 417.26 421.68 425.37 431.34 436.28 440.25 443.36 
404.84 411.36 416.95 421.57 425.43 428.64 434.47 439.30 443.18 446.23 
409.92 415.96 420.86 424.91 428.28 431.10 436.82 441.56 445.37 448.38 

greedy policy at iteration 35
 0  0  0  0  0 -1 -1 -2 -2 -2 
 0  0  0  0  0  0 -1 -1 -1 -2 
 0  0  0  0  0  0  0  0 -1 -1 
 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 36
345.07 353.92 362.48 370.61 378.21 385.24 393.73 401.57 408

 0  0  0  0  0  0  0  0 -1 -1 
 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 43
349.47 358.31 366.88 375.01 382.60 389.63 398.13 405.97 413.09 419.48 
358.31 367.54 376.00 383.80 390.96 397.53 405.66 413.13 419.86 425.84 
366.88 376.00 384.19 391.62 398.39 404.56 412.32 419.40 425.72 430.82 
375.01 383.80 391.62 398.67 405.05 410.82 418.21 424.88 430.37 434.76 
382.60 390.96 398.39 405.05 411.05 416.40 423.39 429.27 434.07 437.86 
389.63 397.53 404.56 410.82 416.40 421.30 427.54 432.76 436.98 440.30 
397.35 404.81 411.38 417.17 422.25 426.51 432.63 437.70 441.79 444.98 
404.21 411.21 417.30 422.60 427.02 430.71 436.69 441.63 445.60 448.70 
410.18 416.70 422.30 426.92 430.77 433.98 439.82 444.64 448.52 451.57 
415.26 421.30 426.20 430.25 433.63 436.44 442.16 446.90 450.72 453.73 

greedy po

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 51
351.70 360.55 369.12 377.24 384.84 391.87 400.37 408.20 415.32 421.71 
360.55 369.78 378.24 386.03 393.20 399.77 407.90 415.37 422.10 428.08 
369.12 378.24 386.43 393.86 400.63 406.79 414.56 421.64 427.96 433.06 
377.24 386.03 393.86 400.91 407.29 413.06 420.45 427.12 432.61 436.99 
384.84 393.20 400.63 407.29 413.28 418.63 425.62 431.51 436.31 440.10 
391.87 399.77 406.79 413.06 418.63 423.54 429.78 435.00 439.22 442.53 
399.59 407.05 413.61 419.41 424.49 428.75 434.86 439.94 444.02 447.22 
406.45 413.45 419.54 424.84 429.26 432.95 438.92 443.87 447.83 450.94 
412.42 418.94 424.53 429.15 433.01 436.22 442.05 446.88 450.76 453.81 
417.50 423.54 428.44 432.49 435.86 438.68 444.40 449.14 452.95 455.97 

greedy policy at iteration 51
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 59
352.64 361.49 370.05 378.18 385.77 392.80 401.30 409.14 416.26 422.65 
361.49 370.72 379.18 386.97 394.13 400.71 408.84 416.30 423.04 429.02 
370.05 379.18 387.37 394.80 401.56 407.73 415.50 422.58 428.90 434.00 
378.18 386.97 394.80 401.85 408.23 414.00 421.39 428.06 433.55 437.93 
385.77 394.13 401.56 408.23 414.22 419.57 426.56 432.44 437.24 441.04 
392.80 400.71 407.73 414.00 419.57 424.47 430.72 435.94 440.16 443.47 
400.53 407.98 414.55 420.34 425.43 429.69 435.80 440.88 444.96 448.16 
407.39 414.39 420.48 425.77 430.19 433.89 439.86 444.80 448.77 451.88 
413.35 419.88 425.47 430.09 433.94 437.16 442.99 447.82 451.70 454.74 
418.44 424.47 429.37 433.42 436.80 439.62 445.34 450.07 453.89 456.90 

greedy policy at iteration 59
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 67
353.03 361.88 370.44 378.57 386.17 393.20 401.69 409.53 416.65 423.04 
361.88 371.11 379.57 387.36 394.53 401.10 409.23 416.69 423.43 429.41 
370.44 379.57 387.76 395.19 401.96 408.12 415.89 422.97 429.29 434.39 
378.57 387.36 395.19 402.24 408.62 414.39 421.78 428.45 433.94 438.32 
386.17 394.53 401.96 408.62 414.61 419.96 426.95 432.84 437.64 441.43 
393.20 401.10 408.12 414.39 419.96 424.87 431.11 436.33 440.55 443.86 
400.92 408.38 414.94 420.74 425.82 430.08 436.19 441.27 445.35 448.55 
407.78 414.78 420.87 426.17 430.59 434.28 440.25 445.19 449.16 452.27 
413.75 420.27 425.86 430.48 434.33 437.55 443.38 448.21 452.09 455.13 
418.83 424.87 429.77 433.82 437.19 440.01 445.73 450.46 454.28 457.29 

greedy policy at iteration 67
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 75
353.20 362.04 370.61 378.74 386.33 393.36 401.86 409.70 416.82 423.21 
362.04 371.27 379.73 387.53 394.69 401.27 409.40 416.86 423.59 429.58 
370.61 379.73 387.93 395.35 402.12 408.29 416.05 423.13 429.45 434.55 
378.74 387.53 395.35 402.41 408.79 414.56 421.94 428.61 434.10 438.49 
386.33 394.69 402.12 408.79 414.78 420.13 427.12 433.00 437.80 441.60 
393.36 401.27 408.29 414.56 420.13 425.03 431.27 436.49 440.71 444.03 
401.08 408.54 415.11 420.90 425.98 430.24 436.36 441.43 445.52 448.72 
407.95 414.94 421.03 426.33 430.75 434.45 440.42 445.36 449.33 452.44 
413.91 420.43 426.03 430.65 434.50 437.71 443.55 448.37 452.25 455.30 
418.99 425.03 429.93 433.98 437.36 440.17 445.89 450.63 454.45 457.46 

greedy policy at iteration 75
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 83
353.27 362.11 370.68 378.81 386.40 393.43 401.93 409.77 416.89 423.28 
362.11 371.34 379.80 387.60 394.76 401.34 409.46 416.93 423.66 429.64 
370.68 379.80 387.99 395.42 402.19 408.36 416.12 423.20 429.52 434.62 
378.81 387.60 395.42 402.47 408.86 414.62 422.01 428.68 434.17 438.56 
386.40 394.76 402.19 408.86 414.85 420.20 427.19 433.07 437.87 441.66 
393.43 401.34 408.36 414.62 420.20 425.10 431.34 436.56 440.78 444.10 
401.15 408.61 415.18 420.97 426.05 430.31 436.43 441.50 445.59 448.78 
408.02 415.01 421.10 426.40 430.82 434.51 440.49 445.43 449.40 452.50 
413.98 420.50 426.10 430.72 434.57 437.78 443.62 448.44 452.32 455.37 
419.06 425.10 430.00 434.05 437.43 440.24 445.96 450.70 454.52 457.53 

greedy policy at iteration 83
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 91
353.30 362.14 370.71 378.84 386.43 393.46 401.96 409.80 416.91 423.31 
362.14 371.37 379.83 387.62 394.79 401.36 409.49 416.96 423.69 429.67 
370.71 379.83 388.02 395.45 402.22 408.39 416.15 423.23 429.55 434.65 
378.84 387.62 395.45 402.50 408.88 414.65 422.04 428.71 434.20 438.58 
386.43 394.79 402.22 408.88 414.87 420.23 427.22 433.10 437.90 441.69 
393.46 401.36 408.39 414.65 420.23 425.13 431.37 436.59 440.81 444.12 
401.18 408.64 415.20 421.00 426.08 430.34 436.45 441.53 445.62 448.81 
408.04 415.04 421.13 426.43 430.85 434.54 440.52 445.46 449.43 452.53 
414.01 420.53 426.13 430.74 434.60 437.81 443.65 448.47 452.35 455.40 
419.09 425.13 430.03 434.08 437.46 440.27 445.99 450.73 454.55 457.56 

greedy policy at iteration 91
 0  0  0  

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 99
353.31 362.15 370.72 378.85 386.44 393.47 401.97 409.81 416.93 423.32 
362.15 371.38 379.84 387.64 394.80 401.38 409.51 416.97 423.70 429.69 
370.72 379.84 388.04 395.46 402.23 408.40 416.16 423.24 429.56 434.66 
378.85 387.64 395.46 402.51 408.90 414.66 422.05 428.72 434.21 438.60 
386.44 394.80 402.23 408.90 414.89 420.24 427.23 433.11 437.91 441.71 
393.47 401.38 408.40 414.66 420.24 425.14 431.38 436.60 440.82 444.14 
401.19 408.65 415.22 421.01 426.09 430.35 436.47 441.54 445.63 448.83 
408.06 415.05 421.14 426.44 430.86 434.56 440.53 445.47 449.44 452.55 
414.02 420.54 426.14 430.76 434.61 437.82 443.66 448.48 452.36 455.41 
419.10 425.14 430.04 434.09 437.47 440.28 446.00 450.74 454.56 457.57 

greedy policy at iteration 99
 0  0  0  

 0  0  0  0  0  0  0  0 -1 -1 
 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 107
353.31 362.16 370.72 378.85 386.45 393.48 401.97 409.81 416.93 423.32 
362.16 371.39 379.85 387.64 394.81 401.38 409.51 416.97 423.71 429.69 
370.72 379.85 388.04 395.47 402.24 408.40 416.17 423.25 429.57 434.67 
378.85 387.64 395.47 402.52 408.90 414.67 422.06 428.73 434.22 438.60 
386.45 394.81 402.24 408.90 414.89 420.24 427.23 433.12 437.92 441.71 
393.48 401.38 408.40 414.67 420.24 425.14 431.39 436.61 440.83 444.14 
401.20 408.66 415.22 421.02 426.10 430.36 436.47 441.55 445.63 448.83 
408.06 415.06 421.15 426.44 430.87 434.56 440.53 445.47 449.44 452.55 
414.02 420.55 426.14 430.76 434.61 437.83 443.66 448.49 452.37 455.41 
419.11 425.14 430.04 434.09 437.47 440.29 446.01 450.74 454.56 457.57 

greedy p

 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  1  0  0  0  0  0 
 3  3  2  2  1  1  0  0  0  0 
 4  3  3  2  2  1  1  0  0  0 

value function at iteration 115
353.32 362.16 370.73 378.85 386.45 393.48 401.98 409.81 416.93 423.32 
362.16 371.39 379.85 387.64 394.81 401.38 409.51 416.98 423.71 429.69 
370.73 379.85 388.04 395.47 402.24 408.40 416.17 423.25 429.57 434.67 
378.85 387.64 395.47 402.52 408.90 414.67 422.06 428.73 434.22 438.60 
386.45 394.81 402.24 408.90 414.89 420.25 427.23 433.12 437.92 441.71 
393.48 401.38 408.40 414.67 420.25 425.15 431.39 436.61 440.83 444.14 
401.20 408.66 415.22 421.02 426.10 430.36 436.47 441.55 445.64 448.83 
408.06 415.06 421.15 426.45 430.87 434.56 440.54 445.48 449.44 452.55 
414.03 420.55 426.14 430.76 434.62 437.83 443.67 448.49 452.37 455.42 
419.11 425.15 430.05 434.10 437.47 440.29 446.01 450.75 454.57 457.58 

greedy policy at iteration 115
 0  0  0

In [7]:
# Policy Iteration on the Grid World problem.
# The evaluation step is delegated to an iterative policy evaluator with its own
# tolerance (theta) and iteration cap; the improvement step has a separate
# threshold and cap.
grid_world = GridWorld()
grid_world_policy, grid_world_v = policy_iteration(
    states=grid_world.states(),
    is_terminal=grid_world.is_terminal,
    actions=grid_world.actions,
    transitions=grid_world.transitions,
    gamma=grid_world.gamma(),
    policy_evaluator=make_iterative_policy_evaluator(theta=1e-4, max_iter=150),
    delta_policy_improv=1e-8,
    max_iter_policy_improv=10,
    print_value=grid_world.print_value,
    print_policy=grid_world.print_policy,
)

value function at iteration 1
0.00 -14.00 -20.00 -22.00 
-14.00 -18.00 -20.00 -20.00 
-20.00 -20.00 -18.00 -14.00 
-22.00 -20.00 -14.00 0.00 

greedy policy at iteration 1
  UL    L    L    L 
   U   UL    L    D 
   U    U   DR    D 
   U    R    R   DR 

value function at iteration 2
0.00 -1.00 -2.00 -3.00 
-1.00 -2.00 -3.00 -2.00 
-2.00 -3.00 -2.00 -1.00 
-3.00 -2.00 -1.00 0.00 

greedy policy at iteration 2
  UL    L    L   DL 
   U   UL UDRL    D 
   U UDRL   DR    D 
  UR    R    R   DR 

value function at iteration 3
0.00 -1.00 -2.00 -3.00 
-1.00 -2.00 -3.00 -2.00 
-2.00 -3.00 -2.00 -1.00 
-3.00 -2.00 -1.00 0.00 

greedy policy at iteration 3
  UL    L    L   DL 
   U   UL UDRL    D 
   U UDRL   DR    D 
  UR    R    R   DR 



In [8]:
# Policy Iteration on the Jack's Rental problem.
# The evaluation step is delegated to an iterative policy evaluator with its own
# tolerance (theta) and iteration cap; the improvement step has a separate
# threshold and cap.
jacks_rental = JacksRental()
jacks_rental_policy, jacks_rental_v = policy_iteration(
    states=jacks_rental.states(),
    is_terminal=jacks_rental.is_terminal,
    actions=jacks_rental.actions,
    transitions=jacks_rental.transitions,
    gamma=jacks_rental.gamma(),
    policy_evaluator=make_iterative_policy_evaluator(theta=1e-6, max_iter=100),
    delta_policy_improv=1e-6,
    max_iter_policy_improv=10,
    print_value=jacks_rental.print_value,
    print_policy=jacks_rental.print_policy,
)

value function at iteration 1
353.32 362.16 370.73 378.86 386.45 393.48 401.98 409.82 416.94 423.33 
362.16 371.39 379.85 387.65 394.81 401.38 409.51 416.98 423.71 429.69 
370.73 379.85 388.04 395.47 402.24 408.41 416.17 423.25 429.57 434.67 
378.86 387.65 395.47 402.52 408.90 414.67 422.06 428.73 434.22 438.61 
386.45 394.81 402.24 408.90 414.90 420.25 427.24 433.12 437.92 441.71 
393.48 401.38 408.41 414.67 420.25 425.15 431.39 436.61 440.83 444.15 
401.20 408.66 415.23 421.02 426.10 430.36 436.47 441.55 445.64 448.83 
408.06 415.06 421.15 426.45 430.87 434.56 440.54 445.48 449.45 452.55 
414.03 420.55 426.15 430.77 434.62 437.83 443.67 448.49 452.37 455.42 
419.11 425.15 430.05 434.10 437.48 440.29 446.01 450.75 454.57 457.58 

greedy policy at iteration 1
 0  0  0  0  0 -1 -1 -2 -2 -2 
 0  0  0  0  0  0 -1 -1 -1 -2 
 0  0  0  0  0  0  0  0 -1 -1 
 0  0  0  0  0  0  0  0  0  0 
 1  1  0  0  0  0  0  0  0  0 
 2  1  1  0  0  0  0  0  0  0 
 2  2  1  1  0  0  0  0  0  0 
 3  2  2  1  