## Markov: Bellman Equation

In [4]:
import numpy as np

# State-transition probability matrix: P[i, j] is the probability of moving
# from state i to state j, so every row has to sum to 1.
P = np.array([
  [0.9, 0.1, 0.0, 0.0, 0.0, 0.0],
  [0.5, 0.0, 0.5, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.6, 0.0, 0.4],
  [0.0, 0.0, 0.0, 0.0, 0.3, 0.7],
  [0.0, 0.2, 0.3, 0.5, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
])

# Reward obtained on reaching each state (one entry per row of P).
R = np.array([-1, -2, -2, 10, 1, 0])

# Sanity checks: a typo in the tables above would otherwise silently produce
# wrong state values further down the notebook.
assert P.shape == (len(R), len(R)), "P must be square with one row per reward"
assert np.allclose(P.sum(axis=1), 1.0), "each row of P must sum to 1"

P, R

(array([[0.9, 0.1, 0. , 0. , 0. , 0. ],
        [0.5, 0. , 0.5, 0. , 0. , 0. ],
        [0. , 0. , 0. , 0.6, 0. , 0.4],
        [0. , 0. , 0. , 0. , 0.3, 0.7],
        [0. , 0.2, 0.3, 0.5, 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 1. ]]),
 array([-1, -2, -2, 10,  1,  0]))

In [5]:
# Given a sequence of visited states, compute its discounted return
def value_by_chain(chain, gamma=0.5, rewards=None):
  """Return the discounted return of a sequence of visited states.

  The result is sum_i gamma**i * rewards[chain[i]]: the state visited at
  step i contributes its reward discounted by gamma**i, so the first state
  is undiscounted.

  Args:
    chain: iterable of integer state indices, in visiting order.
    gamma: discount factor; defaults to 0.5, the value used in this notebook.
    rewards: per-state rewards, indexable by state; defaults to the
      notebook-level array R.

  Returns:
    The discounted return; 0 for an empty chain.
  """
  if rewards is None:
    # Fall back to the notebook's reward table so existing calls keep working.
    rewards = R
  return sum(rewards[state] * gamma ** step for step, state in enumerate(chain))
  
# Example: discounted return of the state sequence 0 -> 1 -> 2 -> 5.
value_by_chain(np.array([0, 1, 2, 5]))

-2.5

In [7]:
# Solve the Bellman equation iteratively (repeated fixed-point sweeps, not gradient descent)
def get_bellman(gamma=0.5, transition=None, rewards=None, n_iter=200):
  """Estimate the state values by iterating the Bellman equation.

  Repeatedly applies v[i] = rewards[i] + gamma * transition[i] . v to every
  state. The update is done in place, so within one sweep later states
  already see the refreshed values of earlier states.

  Args:
    gamma: discount factor; defaults to 0.5, the value used in this notebook.
    transition: square matrix whose row i holds the transition probabilities
      out of state i; defaults to the notebook-level P.
    rewards: per-state rewards; defaults to the notebook-level R.
    n_iter: number of full sweeps over all states (default 200).

  Returns:
    1-D float array with one estimated value per state.
  """
  transition = P if transition is None else np.asarray(transition, dtype=float)
  rewards = R if rewards is None else np.asarray(rewards)
  n_states = len(rewards)

  # Start from all ones; the starting point only affects how quickly the
  # sweeps settle when gamma < 1.
  value = np.ones(n_states)

  for _ in range(n_iter):
    for i in range(n_states):
      # Expected value of the successor states, discounted, plus the reward.
      value[i] = rewards[i] + gamma * transition[i].dot(value)
  return value

# Run the iterative solver and display the resulting state values.
get_bellman()

array([-2.01950168e+00, -2.21451846e+00,  1.16142785e+00,  1.05380928e+01,
        3.58728554e+00,  6.22301528e-61])

In [8]:
# Closed-form (analytic) solution of the Bellman equation
def get_bellman(gamma=0.5, transition=None, rewards=None):
  """Solve the Bellman equation v = rewards + gamma * transition @ v exactly.

  Rearranged, this is the linear system (I - gamma * transition) v = rewards,
  which is solved directly instead of forming an explicit matrix inverse.

  Args:
    gamma: discount factor; defaults to 0.5, the value used in this notebook.
    transition: square matrix whose row i holds the transition probabilities
      out of state i; defaults to the notebook-level P.
    rewards: per-state rewards; defaults to the notebook-level R.

  Returns:
    1-D float array with one value per state.

  Raises:
    numpy.linalg.LinAlgError: if (I - gamma * transition) is singular.
  """
  transition = P if transition is None else np.asarray(transition, dtype=float)
  rewards = R if rewards is None else np.asarray(rewards)

  system = np.eye(*transition.shape) - gamma * transition
  # np.linalg.solve is cheaper and numerically more stable than inv(...).dot(...).
  return np.linalg.solve(system, rewards)

# Run the closed-form solver and display the resulting state values.
get_bellman()

array([-2.01950168, -2.21451846,  1.16142785, 10.53809283,  3.58728554,
        0.        ])