In [2]:
import numpy as np
import scipy.optimize as optimize

# Parameters
N = 12  # Number of time periods (finite planning horizon)
S = 5  # Number of states (demand levels)
A = 3  # Number of actions (price levels)
gamma = 1  # Discount factor (1 = undiscounted; values stay bounded because the horizon is finite)

# Transition probabilities: P[s, a, s'] = Probability of transition from s to s' given action a.
# Example values only -- these should be estimated from data.
# The raw uniform draws are normalized over s' so that every P[s, a, :] is a
# valid probability distribution (non-negative, sums to 1). Without this the
# "expected" continuation value in the Bellman backup is scaled by an
# arbitrary, (s, a)-dependent factor, which distorts both V and the policy.
P = np.random.rand(S, A, S)
P /= P.sum(axis=2, keepdims=True)

# Reward Function: R[s, a] = Expected revenue for action a in state s
R = np.random.rand(S, A)  # Example rewards, should be based on data

# Value function and policy initialization.
# V[t, s] is the optimal expected revenue-to-go from state s at time t;
# V[N, :] stays 0 as the terminal condition (no revenue after the last period).
# policy[t, s] is the index of the optimal action (price level) at time t in state s.
V = np.zeros((N + 1, S))
policy = np.zeros((N, S), dtype=int)

# Dynamic programming (backward induction) to solve the MDP
for t in range(N - 1, -1, -1):
    # Bellman backup for all states and actions at once:
    # Q[s, a] = R[s, a] + gamma * sum_{s'} P[s, a, s'] * V[t+1, s']
    # (P @ V[t+1] contracts the last axis of P with V[t+1], giving shape (S, A).)
    Q = R + gamma * (P @ V[t + 1])
    V[t] = Q.max(axis=1)
    policy[t] = Q.argmax(axis=1)

print("Optimal Policy (Price levels per State and Time):")
print(policy)

Optimal Policy (Price levels per State and Time):
[[1 1 0 1 1]
 [1 1 0 1 1]
 [1 1 0 1 1]
 [1 1 0 1 1]
 [1 1 0 1 1]
 [1 1 0 1 1]
 [1 1 0 1 1]
 [1 1 0 1 1]
 [1 1 0 1 1]
 [1 1 0 1 1]
 [0 1 0 2 1]
 [0 1 1 2 1]]


# Output
- Each row corresponds to a specific time period.
- Each column corresponds to one demand state (demand level).
- Each entry is the index of the chosen price level (action): for example, 0 might represent the lowest price, 1 a medium price, and 2 the highest price.