In [71]:
import numpy as np

# Display floats with 4 digits of precision and suppress scientific notation,
# so the value vectors printed below are easy to compare between iterations.
np.set_printoptions(precision=4, suppress=True)

In [76]:
X = np.array([0, 1, 2])  # state space (low/moderate/high)
U = np.array([0, 1])  # action space (buy/sell)

# Transition probabilities, indexed P[action, from_state, to_state]:
# the depth index is the ACTION, the row index is the state we move FROM,
# and the column index is the state we move TO.
P = np.array([
    [
        [0.2, 0.8, 0.0],
        [0.0, 0.2, 0.8],
        [0.0, 0.2, 0.8]
    ],
    [
        [0.8, 0.2, 0.0],
        [0.8, 0.2, 0.0],
        [0.0, 0.8, 0.2]
    ]
])

# Immediate rewards, indexed G[state, action]: rows are states, cols are actions
G = np.array([
    [4, -1],
    [3, -1],
    [2, -1]
])

gamma = 0.5  # discount factor

# Sanity checks: the tables are typed in by hand, so fail fast if an axis is
# swapped or a row of P is not a probability distribution.
assert P.shape == (len(U), len(X), len(X)), "P must be (action, from_state, to_state)"
assert G.shape == (len(X), len(U)), "G must be (state, action)"
assert np.all(P >= 0), "transition probabilities must be non-negative"
assert np.allclose(P.sum(axis=2), 1.0), "each row of P must sum to 1"

def update_V(V, gamma=0.5):
    """Apply one Bellman optimality backup to every state.

    For each state ``x`` the new value is

        V_next[x] = max_u ( G[x, u] + gamma * sum_y P[u, x, y] * V[y] )

    using the module-level transition tensor ``P`` (action, from, to) and
    reward table ``G`` (state, action).

    Parameters
    ----------
    V : array-like, shape (num_states,)
        Current value estimate. It is not modified.
    gamma : float, default 0.5
        Discount factor applied to the expected next-state value.

    Returns
    -------
    numpy.ndarray of float, shape (num_states,)
        The updated value estimate. It is also printed as a progress trace.
    """
    # Work in floats: copying an integer-typed V would silently truncate
    # the fractional part of every backed-up value.
    V = np.asarray(V, dtype=float)
    V_next = V.copy()
    for x in range(V.shape[0]):
        future_expected_reward = P[:, x, :] @ V  # one entry per action
        # The greedy action must be picked on the *discounted* action values;
        # ranking by G + future (undiscounted) can select a different action
        # than the one that maximises G + gamma * future.
        action_values = G[x, :] + gamma * future_expected_reward
        V_next[x] = action_values.max()
    print("    V_next =", V_next)
    return V_next

def value_iteration(num_states, gamma=0.5, epsilon=1e-4):
    """Run value iteration from an all-zero start until the values settle.

    Repeatedly calls ``update_V`` and stops once the largest absolute change
    between two successive sweeps is no greater than ``epsilon``. Both the
    previous and the new value vector are printed after every sweep.

    Parameters
    ----------
    num_states : int
        Length of the value vector.
    gamma : float, default 0.5
        Discount factor, forwarded to every call of ``update_V``.
    epsilon : float, default 1e-4
        Convergence tolerance on the max-norm of the change between sweeps.

    Returns
    -------
    numpy.ndarray, shape (num_states,)
        The value vector from the last sweep.

    Notes
    -----
    There is no iteration cap: the loop ends only when successive sweeps
    differ by at most ``epsilon``.
    """
    V_old = np.zeros(num_states)
    V = update_V(V_old, gamma)
    t = 1
    print(f"V_old at t={t}:", V_old)
    print(f"V at t={t}:", V)
    while np.abs(V_old - V).max() > epsilon:
        V_old = V.copy()
        # Forward gamma on every sweep; omitting it here would make all
        # sweeps after the first fall back to update_V's default of 0.5.
        V = update_V(V, gamma)
        t += 1
        print(f"V_old at t={t}:", V_old)
        print(f"V at t={t}:", V)
    return V

num_states = len(X)
# Pass the notebook-level discount factor explicitly; without it the call
# falls back to value_iteration's own default and editing `gamma` in the
# setup above would have no effect on the result.
value_iteration(num_states, gamma=gamma)

    V_next = [4. 3. 2.]
V_old at t=1: [0. 0. 0.]
V at t=1: [4. 3. 2.]
    V_next = [5.6 4.1 3.1]
V_old at t=2: [4. 3. 2.]
V at t=2: [5.6 4.1 3.1]
    V_next = [6.2  4.65 3.65]
V_old at t=3: [5.6 4.1 3.1]
V at t=3: [6.2  4.65 3.65]
    V_next = [6.48  4.925 3.925]
V_old at t=4: [6.2  4.65 3.65]
V at t=4: [6.48  4.925 3.925]
    V_next = [6.618  5.0625 4.0625]
V_old at t=5: [6.48  4.925 3.925]
V at t=5: [6.618  5.0625 4.0625]
    V_next = [6.6868 5.1312 4.1312]
V_old at t=6: [6.618  5.0625 4.0625]
V at t=6: [6.6868 5.1312 4.1312]
    V_next = [6.7212 5.1656 4.1656]
V_old at t=7: [6.6868 5.1312 4.1312]
V at t=7: [6.7212 5.1656 4.1656]
    V_next = [6.7384 5.1828 4.1828]
V_old at t=8: [6.7212 5.1656 4.1656]
V at t=8: [6.7384 5.1828 4.1828]
    V_next = [6.747  5.1914 4.1914]
V_old at t=9: [6.7384 5.1828 4.1828]
V at t=9: [6.747  5.1914 4.1914]
    V_next = [6.7513 5.1957 4.1957]
V_old at t=10: [6.747  5.1914 4.1914]
V at t=10: [6.7513 5.1957 4.1957]
    V_next = [6.7534 5.1979 4.1979]
V_ol

array([6.7555, 5.1999, 4.1999])

In [69]:
# Scratch check of the element-wise difference between two value vectors
# (presumably the first sweep of value iteration: zeros vs. [4, 3, 2]).
V = np.array([4, 3, 2])
V_old = np.array([0] * 3)
V_old - V

array([-4, -3, -2])