In [6]:
import numpy as np
import random

# Inventory levels the agent can observe. 'Shutdown' is deliberately kept
# last: the rest of the script slices it off with states[:-1] whenever it
# needs only the non-terminal states.
states = [
    'Low',
    'Medium',
    'High',
    'Shutdown',
]

# Decisions available to the agent in every state.
actions = [
    'Reorder',
    'DoNotOrder',
]

# Environment model: transition_probabilities[state][action][next_state] is
# the probability of landing in next_state. Each innermost distribution sums
# to 1. The numbers are illustrative placeholders; a real model would be
# estimated from the business context.
transition_probabilities = {
    'Low': {
        'Reorder':    {'Low': 0.3, 'Medium': 0.4, 'High': 0.1, 'Shutdown': 0.2},
        'DoNotOrder': {'Low': 0.5, 'Medium': 0.1, 'High': 0.0, 'Shutdown': 0.4},
    },
    'Medium': {
        'Reorder':    {'Low': 0.1, 'Medium': 0.2, 'High': 0.6, 'Shutdown': 0.1},
        'DoNotOrder': {'Low': 0.4, 'Medium': 0.4, 'High': 0.0, 'Shutdown': 0.2},
    },
    'High': {
        'Reorder':    {'Low': 0.0, 'Medium': 0.2, 'High': 0.8, 'Shutdown': 0.0},
        'DoNotOrder': {'Low': 0.2, 'Medium': 0.5, 'High': 0.2, 'Shutdown': 0.1},
    },
    # Absorbing state: once shut down, every action stays in 'Shutdown'.
    'Shutdown': {
        'Reorder':    {'Low': 0.0, 'Medium': 0.0, 'High': 0.0, 'Shutdown': 1.0},
        'DoNotOrder': {'Low': 0.0, 'Medium': 0.0, 'High': 0.0, 'Shutdown': 1.0},
    },
}

# Immediate reward rewards[state][action]; it depends only on the state the
# action is taken in, not on the state that follows.
rewards = {
    'Low': {
        'Reorder': 5,
        'DoNotOrder': -5,
    },
    'Medium': {
        'Reorder': 1,
        'DoNotOrder': 2,
    },
    'High': {
        'Reorder': -2,
        'DoNotOrder': 0,
    },
    'Shutdown': {
        'Reorder': 0,
        'DoNotOrder': 0,
    },
}

# Initialize Q-table
# Q[state][action] holds the current value estimate for taking `action` in
# `state`. Every entry starts at 0, and each state gets its own inner dict so
# updates to one state never touch another.
Q = {s: dict.fromkeys(actions, 0) for s in states}

# Hyperparameters
alpha = 0.1  # Learning rate: fraction of the TD error applied per update
gamma = 0.9  # Discount factor applied to the next state's best Q-value

# Number of episodes
episodes = 1000

# Epsilon-greedy exploration schedule. Epsilon is lowered by `decay_rate`
# once per episode, so deriving the step from `episodes` (rather than a
# separate literal) makes the linear decay reach `min_epsilon` at the end of
# training no matter how many episodes are configured.
epsilon = 0.99  # Initial exploration rate
min_epsilon = 0.01  # Minimum exploration rate
decay_rate = (epsilon - min_epsilon) / episodes  # Per-episode decrement

# Q-learning algorithm
# Runs `episodes` episodes. Each one starts in a random non-terminal state and
# continues until the process reaches 'Shutdown'.
for episode in range(episodes):
    state = random.choice(states[:-1])

    while state != 'Shutdown':
        # Epsilon-greedy selection: with probability epsilon pick a random
        # action, otherwise take the action with the highest current Q-value.
        explore = random.uniform(0, 1) < epsilon
        action = random.choice(actions) if explore else max(Q[state], key=Q[state].get)

        # Sample the successor state from the model's distribution for this
        # (state, action) pair, ordered to line up with `states`.
        outcome_probs = [transition_probabilities[state][action][s] for s in states]
        next_state = np.random.choice(states, p=outcome_probs)

        # The reward is determined by the state the action was taken in.
        reward = rewards[state][action]

        # Temporal-difference update: move the estimate a fraction `alpha`
        # of the way towards the bootstrapped target.
        td_target = reward + gamma * max(Q[next_state].values())
        Q[state][action] += alpha * (td_target - Q[state][action])

        state = next_state

    # Reduce exploration linearly after each episode, never going below the floor.
    epsilon = max(min_epsilon, epsilon - decay_rate)

# Derive the optimal policy
# Greedy read-out of the learned table: for every non-terminal state pick the
# action with the largest Q-value (the first-listed action wins a tie).
policy = {state: max(Q[state], key=Q[state].get) for state in states[:-1]}

print("Optimal Policy:")
for state, best_action in policy.items():
    print(f"State: {state}, Action: {best_action}")

# Dump the full table, terminal state included, for inspection.
print("\nQ-Table:")
for state, q_values in Q.items():
    print(f"State: {state}, Q-values: {q_values}")

Optimal Policy:
State: Low, Action: Reorder
State: Medium, Action: Reorder
State: High, Action: DoNotOrder

Q-Table:
State: Low, Q-values: {'Reorder': 11.339793985120997, 'DoNotOrder': 3.092250843855944}
State: Medium, Q-values: {'Reorder': 6.959861864361998, 'DoNotOrder': 6.35397452872472}
State: High, Q-values: {'Reorder': 5.707805618081022, 'DoNotOrder': 7.558512575501913}
State: Shutdown, Q-values: {'Reorder': 0, 'DoNotOrder': 0}
