In [43]:
import time

import gym
import mdptoolbox, mdptoolbox.example
import numpy as np

In [2]:
# Create the CartPole environment and reset it. The reset call is the cell's
# last expression, so whatever it returns is what the notebook displays.
env = gym.make('CartPole-v0')
env.reset()


array([ 0.02538519, -0.04439962, -0.01698877, -0.00198114])

In [3]:
# Random-agent rollout: render every frame and restart the episode whenever it ends.
TOTAL_STEPS = 1000
for _step in range(TOTAL_STEPS):
    env.render()
    action = env.action_space.sample()  # take a random action
    obs, rew, done, info = env.step(action)
    if not done:
        continue
    env.reset()
env.close()

In [4]:
# Slowed-down random rollout: 500 steps with a 20 ms pause per frame, printing
# the step index every 10th step. `time` and `gym` come from the import cell.
env = gym.make('CartPole-v0')
env.reset()
try:
    for i in range(500):
        env.render()
        obs, rew, done, info = env.step(env.action_space.sample())  # take a random action
        if done:
            # Stepping an episode that has already ended is not valid; start a new one.
            env.reset()
        time.sleep(0.02)
        if i % 10 == 0:
            print(i)
finally:
    # Always release the render window, even if a step or render call raises.
    env.close()

0
10




20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


MDP Toolbox Example
https://pymdptoolbox.readthedocs.io/en/latest/api/mdp.html


- Creates transition probability P, size (Action X State X State)
- Creates Reward Matrix R, size (State X Action)
- Action is either WAIT (Action = 0) or CUT (Action = 1). There is some probability p that the fire burns the forest. 
- Each state is the age of the forest, i.e. the number of years since it was last cut or burned. With S states, the state space is {0, 1, ..., S-1}
- 

In [5]:
# Reference: https://pymdptoolbox.readthedocs.io/en/latest/api/mdp.html
import mdptoolbox, mdptoolbox.example

# Build the three-state forest problem; every argument is spelled out explicitly.
forest_kwargs = dict(S=3, r1=4, r2=2, p=0.1, is_sparse=False)
P, R = mdptoolbox.example.forest(**forest_kwargs)

- S: The number of states, the number of years old the forest can be 
- r1: the reward when the forest is in its oldest state and action WAIT is performed 
- r2: the reward when the forest is in its oldest state and action CUT is performed
- p: the probability that a wild fire occurs 

In [6]:
# Show the dense transition array built above (one state-by-state matrix per action).
P

array([[[0.1, 0.9, 0. ],
        [0.1, 0. , 0.9],
        [0.1, 0. , 0.9]],

       [[1. , 0. , 0. ],
        [1. , 0. , 0. ],
        [1. , 0. , 0. ]]])

In [7]:
# Transition probabilities under action 0: for each current state (row),
# the probability of landing in each next state (column).
P[0, :, :] 

array([[0.1, 0.9, 0. ],
       [0.1, 0. , 0.9],
       [0.1, 0. , 0.9]])

In [8]:
# Transition probabilities under action 1: for each current state (row),
# the probability of landing in each next state (column).
P[1, :, :]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [9]:
# Show the reward array (rows are states, columns are actions).
R

array([[0., 0.],
       [0., 1.],
       [4., 2.]])

In [10]:
# Reward for taking action 0 in each state (column 0 of R).
R[:, 0]

array([0., 0., 4.])

In [11]:
# Reward for taking action 1 in each state (column 1 of R).
R[:, 1]

array([0., 1., 2.])

In [12]:
# Build the forest problem again, this time asking for sparse transition
# matrices, then report how many transition matrices came back.
sparse_problem = mdptoolbox.example.forest(is_sparse=True)
Psp, Rsp = sparse_problem

len(Psp)

2

In [13]:
# Sparse transition matrix stored at index 0 of Psp.
Psp[0]

<3x3 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [14]:
# Sparse transition matrix stored at index 1 of Psp.
Psp[1]

<3x3 sparse matrix of type '<class 'numpy.longlong'>'
	with 3 stored elements in Compressed Sparse Row format>

In [15]:
# Reward array returned alongside the sparse transition matrices.
Rsp

array([[0., 0.],
       [0., 1.],
       [4., 2.]])

The sparse representation of P and R holds the same values as the non-sparse (dense) version

In [16]:
# Compare the sparse and dense transition matrices for every action, not only
# action 0, so the check covers all of P.
all((Psp[a].todense() == P[a]).all() for a in range(len(Psp)))

True

In [17]:
# True only if every entry of the two reward arrays matches.
(Rsp == R).all()

True

Use Policy Iteration on the Forest MDP
https://pymdptoolbox.readthedocs.io/en/latest/api/mdp.html

In [18]:
# Solve the forest MDP with Policy Iteration and check the resulting state
# values against reference values.
pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)  # P = transitions, R = rewards, 0.9 = discount
pi.run()

expected = (26.244000000000014, 29.484000000000016, 33.484000000000016)
# abs() is required: a one-sided difference also passes when pi.V is far above
# the expected values, which would hide a real mismatch.
all(abs(expected[k] - pi.V[k]) < 1e-12 for k in range(len(expected)))

True

In [19]:
# Policy found by Policy Iteration: one action per state.
pi.policy

(0, 0, 0)

In [20]:
# Re-solve with a much smaller discount factor to see how the policy changes.
pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.2)  # P = transitions, R = rewards, 0.2 = discount
pi.run()

# No value check here: the reference values used with discount 0.9 do not apply
# to discount 0.2, and a mid-cell expression would be discarded anyway.
pi.policy

(0, 1, 0)

Use Value Iteration on the Forest MDP https://pymdptoolbox.readthedocs.io/en/latest/api/mdp.html

In [21]:
# Set up Value Iteration on the current P and R with a discount factor of 0.96.
vi = mdptoolbox.mdp.ValueIteration(P, R, 0.96)
# Display the solver's verbose flag.
vi.verbose

False

In [22]:
# Run the solver; its results are read from its attributes in the cells below.
vi.run()

In [23]:
# Reference state values for the 0.96-discount Value Iteration run.
expected = (5.93215488, 9.38815488, 13.38815488)

# Assert the comparison: as a bare mid-cell expression its result was thrown
# away. abs() makes the tolerance two-sided.
assert all(abs(expected[k] - vi.V[k]) < 1e-12 for k in range(len(expected)))

vi.policy  # Tuple shows which action maximizes the value in each state


(0, 0, 0)

In [24]:
# Iteration count reported by the Value Iteration run.
vi.iter

4

Try 2: Value Iteration with a lower discount factor (0.20)

In [25]:
# Arguments: transition probability matrix, reward matrix, then discount factor.
vi = mdptoolbox.mdp.ValueIteration(P, R, 0.20)
vi.run()
print(vi.policy)
print(vi.iter)

(0, 1, 0)
3


#### Add more states to the Forest Management problem

In [80]:
# Reference: https://pymdptoolbox.readthedocs.io/en/latest/api/mdp.html
import mdptoolbox, mdptoolbox.example

# Same forest problem as before, now with ten age states.
larger_forest = dict(S=10, r1=4, r2=2, p=0.1, is_sparse=False)
P, R = mdptoolbox.example.forest(**larger_forest)

In [81]:
# Show the transition array for the ten-state problem (one matrix per action).
P

array([[[0.1, 0.9, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0.1, 0. , 0.9, 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0.1, 0. , 0. , 0.9, 0. , 0. , 0. , 0. , 0. , 0. ],
        [0.1, 0. , 0. , 0. , 0.9, 0. , 0. , 0. , 0. , 0. ],
        [0.1, 0. , 0. , 0. , 0. , 0.9, 0. , 0. , 0. , 0. ],
        [0.1, 0. , 0. , 0. , 0. , 0. , 0.9, 0. , 0. , 0. ],
        [0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0.9, 0. , 0. ],
        [0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.9, 0. ],
        [0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.9],
        [0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.9]],

       [[1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. ,

In [82]:
# Show the reward array for the ten-state problem (rows are states, columns are actions).
R

array([[0., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [4., 2.]])

##### Policy Iteration with bigger state amount

In [83]:
# Policy Iteration on the ten-state forest problem.
pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)  # P = transitions, R = rewards, 0.9 = discount
pi.run()

# The three-state reference values are not checked here: they describe a
# different problem size, and the comparison result was never displayed.
print("Optimal Policy", pi.policy)
print("Iters to Converge", pi.iter)


Optimal Policy (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
Iters to Converge 9


##### Value Iteration with bigger state amount

In [87]:
# Value Iteration on the ten-state forest problem.
vi = mdptoolbox.mdp.ValueIteration(P, R, 0.90)
print(vi.verbose)
vi.run()

# The three-state reference values are not checked here: they describe a
# different problem size, and the comparison result was never displayed.
print("Optimal Policy", vi.policy)  # Tuple shows which action maximizes the value in each state
print("Iters to Converge:", vi.iter)

False
Optimal Policy (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
Iters to Converge: 16


##### Q Learning on Forest Problem

In [86]:
# Q-Learning on the current forest problem (P and R from the cells above).
# The learner draws from NumPy's global random generator, so seed it first;
# otherwise the printed policy changes on every run.
np.random.seed(0)

ql = mdptoolbox.mdp.QLearning(P, R, 0.9)
ql.run()

print(ql.policy)

(0, 1, 0, 1, 0, 1, 0, 0, 1, 1)


##### Loop Through Different State Sizes and Compare Difference in Convergence Iterations

In [88]:
# Solve forest problems of increasing size with Policy Iteration (PI), Value
# Iteration (VI) and Q-Learning, then compare policies, iteration counts,
# run time and the value of the last (oldest) state.
state_size = [5, 10, 20, 100, 1000]
DISCOUNT = 0.9

for ss in state_size:
    P, R = mdptoolbox.example.forest(S=ss, r1=4, r2=2, p=0.1, is_sparse=False)

    # Policy Iteration
    pi = mdptoolbox.mdp.PolicyIteration(P, R, DISCOUNT)
    pi.run()

    # Value Iteration
    vi = mdptoolbox.mdp.ValueIteration(P, R, DISCOUNT)
    vi.run()

    # Q-Learning samples from NumPy's global random generator; re-seed for each
    # problem size so every row of the comparison is reproducible.
    np.random.seed(0)
    ql = mdptoolbox.mdp.QLearning(P, R, DISCOUNT)
    ql.run()

    print()
    print("State size:", ss)
    print("Did VI and PI Give the same policy?", vi.policy == pi.policy)
    print("QLearning and PI?", ql.policy == pi.policy)
    print("QLearning and VI", ql.policy == vi.policy)

    # Report ties explicitly instead of printing a false "x < x" statement.
    if pi.iter < vi.iter:
        print("PI requires fewer iterations to converge", pi.iter, " < ", vi.iter)
    elif vi.iter < pi.iter:
        print("VI requires fewer iterations to converge", vi.iter, " < ", pi.iter)
    else:
        print("PI and VI require the same number of iterations to converge", pi.iter)

    if pi.time < vi.time:
        print("PI is faster to converge", pi.time, " < ", vi.time)
    else:
        print("VI is faster to converge", vi.time, " < ", pi.time)

    # V[-1] is each solver's value for the last state; the messages call it "reward".
    if pi.V[-1] > vi.V[-1]:
        print("PI has higher reward", pi.V[-1], " > ", vi.V[-1])
    elif vi.V[-1] > pi.V[-1]:
        print("VI has higher reward", vi.V[-1], " > ", pi.V[-1])
    else:
        print("PI and VI have the same reward", pi.V[-1])
    print(pi.V[-1])
    print(vi.V[-1])
    print(ql.V[-1])

   


State size: 5
Did VI and PI Give the same policy? True
QLearning and PI? False
QLearning and VI False
PI requires fewer iterations to converge 4  <  6
VI is faster to converge 0.0015799999237060547  <  0.0022399425506591797
PI has higher reward 29.208852400000016  >  15.482564617000001
29.208852400000016
15.482564617000001
2.1289947463987993

State size: 10
Did VI and PI Give the same policy? True
QLearning and PI? False
QLearning and VI False
PI requires fewer iterations to converge 9  <  16
VI is faster to converge 0.0008697509765625  <  0.0036683082580566406
PI has higher reward 23.89652993194315  >  21.664850965110947
23.89652993194315
21.664850965110947
5.0592475028361115

State size: 20
Did VI and PI Give the same policy? True
QLearning and PI? False
QLearning and VI False
PI requires fewer iterations to converge 10  <  39
VI is faster to converge 0.004132747650146484  <  0.006227970123291016
PI has higher reward 23.172433847048566  >  23.089675091923866
23.172433847048566
23.08

#### Gridworld problem from Open Gym

- S: Initial State
- F: Frozen surface (a safe tile to step on)
- H: Hole
- G: The Goal
- Red Square: Current Position

In [89]:
# Build the Frozen Lake environment, move it to its initial state,
# and draw the current board.
env = gym.make('FrozenLake-v0')
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [94]:
# Report the environment's action and observation spaces, one per line.
space_report = (
    ("Action space:", env.action_space),
    ("Observation Space:", env.observation_space),
)
for label, space in space_report:
    print(label, space)

Action space: Discrete(4)
Observation Space: Discrete(16)


In [95]:
# Dummy to randomly play Frozen Lake

# Step budget for the random agent.
MAX_ITERATIONS = 10

env = gym.make("FrozenLake-v0")
env.reset()
env.render()

# Act at random, drawing the board after each move, until the episode ends
# or the step budget is used up.
steps_taken = 0
done = False
while steps_taken < MAX_ITERATIONS and not done:
    random_action = env.action_space.sample()
    new_state, reward, done, info = env.step(random_action)
    env.render()
    steps_taken += 1


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
