Some MDP examples from the `mdptoolbox.example` module of the [MDP Toolbox for Python (pymdptoolbox)](https://pymdptoolbox.readthedocs.io/en/latest/api/example.html#module-mdptoolbox.example)

In [None]:
pip install pymdptoolbox


In [None]:
import mdptoolbox.example

# Build the toolbox's "forest" example problem (transition and reward arrays).
P, R = mdptoolbox.example.forest()
# Uncomment to inspect the model before solving it:
# print('transition matrix P:\n' , P)
# print('reward matrix R:\n' , R)

discount = 0.9  # discount factor

# Solve the discounted MDP with the value iteration algorithm.
vi = mdptoolbox.mdp.ValueIteration(P, R, discount)
vi.run()

# Report the solution followed by the solver statistics, one item per line.
for label, result in (
    ('Policy: ', vi.policy),               # original note: result is (0, 0, 0)
    ('Value Function:', vi.V),             # value function
    ('Number of Iterations:', vi.iter),    # number of iterations
    ('Time Taken:', vi.time),              # time taken to run the algorithm
):
    print(label, result)

NotImplementedError: You should create a run() method.

In [None]:
import numpy
import mdptoolbox.example

# Fix NumPy's global seed: needed to get consistent output between devices.
numpy.random.seed(0)

# Randomly generated example problem with 4 states and 3 actions.
n_states, n_actions = 4, 3
P, R = mdptoolbox.example.rand(n_states, n_actions)

print('transition matrix P:\n' , P)
# Only the shape of R is displayed: it is a 3D array (A X S X S).
print('reward matrix R:\n' , R.shape)


transition matrix P:
 [[[0.21977283 0.14889403 0.30343592 0.32789723]
  [1.         0.         0.         0.        ]
  [0.         0.43718772 0.54480359 0.01800869]
  [0.39766289 0.39997167 0.12547318 0.07689227]]

 [[1.         0.         0.         0.        ]
  [0.32261337 0.15483812 0.32271303 0.19983549]
  [0.33816885 0.2766999  0.12960299 0.25552826]
  [0.41299411 0.         0.58369957 0.00330633]]

 [[0.32343037 0.15178596 0.28733094 0.23745272]
  [0.36348538 0.24483321 0.16114188 0.23053953]
  [1.         0.         0.         0.        ]
  [0.         0.         1.         0.        ]]]
reward matrix R:
 (3, 4, 4)


Student Markov reward process (MRP) from [David Silver's MDP lecture slides, hosted by Stanford CME 241](https://web.stanford.edu/class/cme241/lecture_slides/david_silver_slides/MDP.pdf)

In [40]:
import numpy as np


def discounted_return(rewards, sequence, discount, first_step=0):
    """Discounted sum of the rewards collected along one sampled episode.

    Parameters
    ----------
    rewards : sequence of numbers
        Reward of each state, indexed by state number.
    sequence : sequence of int
        State indices visited, in order.
    discount : float
        Discount factor applied once per time step.
    first_step : int, optional
        Time step assigned to ``sequence[0]``; its reward is weighted by
        ``discount ** first_step``.

    Returns
    -------
    number
        ``sum(discount ** (first_step + k) * rewards[sequence[k]])`` over the
        episode; ``0`` for an empty sequence.
    """
    return sum(discount ** step * rewards[state]
               for step, state in enumerate(sequence, start=first_step))


# 7 state student markov reward process.
# A state's position in `States` is its row/column in P and its column in R.
States = ['Facebook',
          'Class 1',
          'Class 2',
          'Class 3',
          'Passed',
          'Sleep',
          'Pub']

# Transition probabilities: P[i, j] is the probability of moving from state i to j.
# NOTE(review): the 'Sleep' row is all zeros, so it does not sum to 1. The
# lecture's matrix appears to use a self-loop of 1.0 for this terminal state --
# confirm which convention any code that consumes P expects before changing it.
P = np.array([
    [0.9, 0.1, 0, 0, 0, 0, 0],     # Facebook
    [0.5, 0, 0.5, 0, 0, 0, 0],     # Class 1
    [0, 0, 0, 0.8, 0, 0.2, 0],     # Class 2
    [0, 0, 0, 0, 0.6, 0, 0.4],     # Class 3
    [0, 0, 0, 0, 0, 1, 0],         # Passed
    [0, 0, 0, 0, 0, 0, 0],         # Sleep
    [0, 0.2, 0.4, 0.4, 0, 0, 0]])  # Pub

# Reward of each state, stored as a single row (shape (1, 7)).
R = np.array([[-1, -2, -2, -2, 10, 0, 1]])

discount = 0.5

# Sample episodes (state indices) paired with the return quoted for them in the
# original notes. Quoted values are rounded to about two decimals.
sample_episodes = [
    ([1, 2, 3, 4, 5], -2.25),
    ([1, 0, 0, 1, 2, 5], -3.125),
    ([1, 2, 3, 6, 2, 3, 4, 5], -3.41),
    ([1, 0, 0, 1, 2, 3, 6, 1, 2, 3, 4, 5], -3.20),
    ([0, 0, 0, 1, 2, 3, 6, 2, 5], -2.18),
]
QUOTE_TOLERANCE = 0.02  # slack for the rounding of the quoted values

# Evaluate every sample instead of overwriting one variable, so each quoted
# value is actually checked.
for episode, quoted in sample_episodes:
    episode_return = discounted_return(R[0], episode, discount)
    assert abs(episode_return - quoted) < QUOTE_TOLERANCE, (episode, episode_return, quoted)
    path = ' -> '.join(States[state] for state in episode)
    print(f'{path}: return = {episode_return:.4f} (quoted {quoted})')

# Detailed breakdown for one episode (the one the original cell left active).
potential_sequence = sample_episodes[3][0]

# Calculate the immediate and discounted reward for the potential sequence
immediate_reward = R[0, potential_sequence[0]]
print('immediate reward:', immediate_reward)

# Rewards of every later state, discounted from time step 1 onwards.
discounted_reward = discounted_return(R[0], potential_sequence[1:], discount, first_step=1)
print('discounted reward:', discounted_reward)

# Name kept for compatibility; this is the return of this one sampled sequence.
value_function = immediate_reward + discounted_reward
print('the value of this sequence is:', value_function)

immediate reward: -2
discounted reward: -1.189453125
the value of this sequence is: -3.189453125
