In [111]:
import numpy as np

# Print arrays with 4 decimal places and fixed-point notation (no scientific
# notation), so the value-function iterates below are easy to compare by eye.
np.set_printoptions(precision=4, suppress=True)

In [112]:
X = np.array([0, 1, 2])  # state space: 0 = low, 1 = moderate, 2 = high
U = np.array([0, 1])  # action space: 0 = buy, 1 = sell

# Transition probabilities, indexed P[u, x, y] = Pr(next state y | state x, action u):
# depth index is the ACTION, row index is the state FROM, column index is the state TO.
P = np.array([
    [
        [0.2, 0.8, 0.0],
        [0.0, 0.2, 0.8],
        [0.0, 0.2, 0.8]
    ],
    [
        [0.8, 0.2, 0.0],
        [0.8, 0.2, 0.0],
        [0.0, 0.8, 0.2]
    ]
])

# Immediate rewards G[x, u]: rows are states, columns are actions.
G = np.array([
    [4, -1],
    [3, -1],
    [2, -1]
])

gamma = 0.5  # discount factor

# Sanity checks on the model: a mistyped probability or a missing row would
# otherwise only show up as silently wrong value functions further down.
assert P.shape == (len(U), len(X), len(X)), "P must be (actions, states, states)"
assert G.shape == (len(X), len(U)), "G must be (states, actions)"
assert np.all(P >= 0.0), "transition probabilities must be non-negative"
assert np.allclose(P.sum(axis=-1), 1.0), "each row of P must sum to 1"

def update_V(V, gamma=0.5):
    """Apply one greedy Bellman backup to the value estimate ``V``.

    For each state ``x`` the action values
    ``Q[u] = G[x, u] + gamma * sum_y P[u, x, y] * V[y]`` are computed from the
    module-level transition tensor ``P`` and reward table ``G``. The
    maximising action and its value are kept. The new estimate is also
    printed (indented) as a progress trace.

    Parameters
    ----------
    V : array_like, shape (num_states,)
        Current value estimate. It is not modified. Integer input is
        promoted to float so the backed-up values are not truncated.
    gamma : float, optional
        Discount factor applied to the expected next-state value.

    Returns
    -------
    V_next : numpy.ndarray of float, shape (num_states,)
        Value estimate after the backup.
    policy : list of int
        Greedy action index for each state (``np.argmax`` picks the lowest
        index on ties).
    """
    # Work in float: copying an integer V would make the assignments below
    # silently round every backed-up value towards zero.
    V = np.asarray(V, dtype=float)
    V_next = V.copy()
    policy = []
    for x in range(V.shape[0]):
        # P[:, x, :] @ V has one entry per action: the expected next value.
        q_values = G[x, :] + gamma * (P[:, x, :] @ V)  # shape (num_actions,)
        opt_u = int(np.argmax(q_values))
        policy.append(opt_u)
        V_next[x] = q_values[opt_u]
    print("    V_next =", V_next)
    return V_next, policy

def fixed_policy_update_V(V, policy_dict, gamma=0.5):
    """Apply one Bellman backup to ``V`` under a fixed policy.

    For each state ``x`` the action ``u = policy_dict[x]`` is looked up and
    the value is set to ``G[x, u] + gamma * sum_y P[u, x, y] * V[y]``, using
    the module-level transition tensor ``P`` and reward table ``G``. The new
    estimate is also printed (indented) as a progress trace.

    Parameters
    ----------
    V : array_like, shape (num_states,)
        Current value estimate. It is not modified. Integer input is
        promoted to float so the backed-up values are not truncated.
    policy_dict : mapping or sequence
        Action index for each state index; anything supporting
        ``policy_dict[x]`` for ``x`` in ``range(num_states)`` works.
    gamma : float, optional
        Discount factor applied to the expected next-state value.

    Returns
    -------
    numpy.ndarray of float, shape (num_states,)
        Value estimate after the backup.
    """
    # Work in float: copying an integer V would make the assignments below
    # silently round every backed-up value towards zero.
    V = np.asarray(V, dtype=float)
    V_next = V.copy()
    for x in range(V.shape[0]):
        fixed_u = policy_dict[x]
        # Only the policy's action matters, so only its transition row is used.
        V_next[x] = G[x, fixed_u] + gamma * (P[fixed_u, x, :] @ V)
    print("    V_next =", V_next)
    return V_next

def value_iteration(num_states, gamma=0.5, epsilon=1e-4):
    """Run value iteration from the all-zero value function.

    Repeatedly applies ``update_V`` until the largest absolute change between
    two consecutive value estimates is at most ``epsilon``. Both estimates
    are printed after every step.

    Parameters
    ----------
    num_states : int
        Number of states; the initial estimate is ``np.zeros(num_states)``.
    gamma : float, optional
        Discount factor passed on to ``update_V``.
    epsilon : float, optional
        Convergence tolerance on the max-norm change between iterates.

    Returns
    -------
    numpy.ndarray
        The final value estimate.
    """
    V_old = np.zeros(num_states)
    # update_V returns (values, greedy_policy); only the values drive the
    # iteration. Assigning the whole tuple to V made V.copy() fail below.
    V, _ = update_V(V_old, gamma)
    t = 1
    print(f"V_old at t={t}:", V_old)
    print(f"V at t={t}:", V)
    while np.abs(V_old - V).max() > epsilon:
        V_old = V.copy()
        V, _ = update_V(V, gamma)
        t += 1
        print(f"V_old at t={t}:", V_old)
        print(f"V at t={t}:", V)
    return V

def fixed_policy_value_iteration(policy_dict, num_states, gamma=0.5, epsilon=1e-4):
    """Iteratively evaluate a fixed policy, starting from the zero vector.

    ``fixed_policy_update_V`` is applied over and over; the loop ends once
    the largest absolute difference between two successive estimates no
    longer exceeds ``epsilon``. The previous and current estimate are printed
    after each step, and the last estimate is returned.
    """
    previous = np.zeros(num_states)
    current = fixed_policy_update_V(previous, policy_dict, gamma)
    step = 1
    while True:
        print(f"V_old at t={step}:", previous)
        print(f"V at t={step}:", current)
        # Converged: the biggest per-state change is within tolerance.
        if not np.abs(previous - current).max() > epsilon:
            break
        previous = current.copy()
        current = fixed_policy_update_V(current, policy_dict, gamma)
        step += 1
    return current

# One value per state in the state space.
num_states = X.shape[0]
# value_iteration(num_states)

# Policy under evaluation, given as (state index, action index) pairs.
policy_dict = dict([
    (0, 0),  # L: BUY
    (1, 0),  # M: BUY
    (2, 1),  # H: SELL
])

# Last expression of the cell, so the converged values are shown as its output.
fixed_policy_value_iteration(policy_dict, num_states)

    V_next = [ 4.  3. -1.]
V_old at t=1: [0. 0. 0.]
V at t=1: [ 4.  3. -1.]
    V_next = [5.6 2.9 0.1]
V_old at t=2: [ 4.  3. -1.]
V at t=2: [5.6 2.9 0.1]
    V_next = [5.72 3.33 0.17]
V_old at t=3: [5.6 2.9 0.1]
V at t=3: [5.72 3.33 0.17]
    V_next = [5.904 3.401 0.349]
V_old at t=4: [5.72 3.33 0.17]
V at t=4: [5.904 3.401 0.349]
    V_next = [5.9508 3.4797 0.3953]
V_old at t=5: [5.904 3.401 0.349]
V at t=5: [5.9508 3.4797 0.3953]
    V_next = [5.987  3.5061 0.4314]
V_old at t=6: [5.9508 3.4797 0.3953]
V at t=6: [5.987  3.5061 0.4314]
    V_next = [6.0011 3.5232 0.4456]
V_old at t=7: [5.987  3.5061 0.4314]
V at t=7: [6.0011 3.5232 0.4456]
    V_next = [6.0094 3.5305 0.4538]
V_old at t=8: [6.0011 3.5232 0.4456]
V at t=8: [6.0094 3.5305 0.4538]
    V_next = [6.0132 3.5346 0.4576]
V_old at t=9: [6.0094 3.5305 0.4538]
V at t=9: [6.0132 3.5346 0.4576]
    V_next = [6.0151 3.5365 0.4596]
V_old at t=10: [6.0132 3.5346 0.4576]
V at t=10: [6.0151 3.5365 0.4596]
    V_next = [6.0161 3.5375 0.4

array([6.017 , 3.5384, 0.4615])

In [113]:
# Exact evaluation of the fixed policy: its value function satisfies
# V = b + T V, i.e. (I - T) V = b, where row x of T is the discounted
# transition row of the action the policy takes in state x and b holds the
# matching immediate rewards. Both are read off the model (P, G, gamma,
# policy_dict) instead of being typed by hand, so they cannot drift from it.
# For the policy above this gives
#   T = [[0.1, 0.4, 0. ], [0. , 0.1, 0.4], [0. , 0.4, 0.1]],  b = [4, 3, -1]^T.
mu_actions = np.array([policy_dict[x] for x in X])
T = gamma * P[mu_actions, X, :]
b = G[X, mu_actions][:, np.newaxis]

# Solve the linear system directly rather than forming the explicit inverse.
V_mu_0 = np.linalg.solve(np.eye(T.shape[0]) - T, b).flatten()
print(V_mu_0)

# One greedy backup from the exact values: the policy-improvement step.
V_new, policy = update_V(V_mu_0, gamma)
print(V_new)
print(policy)

[6.0171 3.5385 0.4615]
    V_next = [6.0171 3.5385 2.5385]
[6.0171 3.5385 2.5385]
[0, 0, 0]
