In [2]:
import numpy as np
import gym
from stable_baselines3 import DQN
from stable_baselines3.dqn import MlpPolicy

In [27]:
# Define the custom environment
class CustomEnv(gym.Env):
    """Tabular Markov decision process exposed through the classic gym API.

    ``transition_model[a, s, s']`` is used as the probability of moving from
    state ``s`` to state ``s'`` under action ``a``; ``reward_model[a, s, s']``
    is the reward returned for that transition. Every episode starts in
    state 0 and ends as soon as ``TERMINAL_STATE`` is entered.
    """

    # Entering this state ends the episode.
    TERMINAL_STATE = 3

    def __init__(self, transition_model, reward_model, discount_factor):
        """Store the MDP tables and size the gym spaces from their shape.

        Args:
            transition_model: array-like of shape (n_actions, n_states, n_states).
            reward_model: array-like with the same shape as ``transition_model``.
            discount_factor: stored on the instance; the environment itself
                never reads it.

        Raises:
            ValueError: if ``transition_model`` is not of shape (A, S, S) or
                ``reward_model`` has a different shape.
        """
        super(CustomEnv, self).__init__()

        transition_model = np.asarray(transition_model)
        reward_model = np.asarray(reward_model)
        if transition_model.ndim != 3 or transition_model.shape[1] != transition_model.shape[2]:
            raise ValueError(
                "transition_model must have shape (n_actions, n_states, n_states), "
                f"got {transition_model.shape}"
            )
        if reward_model.shape != transition_model.shape:
            raise ValueError(
                f"reward_model shape {reward_model.shape} does not match "
                f"transition_model shape {transition_model.shape}"
            )

        self.transition_model = transition_model
        self.reward_model = reward_model
        self.discount_factor = discount_factor

        # Derive the space sizes from the tables instead of hard-coding them.
        n_actions, n_states, _ = transition_model.shape
        self.action_space = gym.spaces.Discrete(n_actions)
        self.observation_space = gym.spaces.Discrete(n_states)
        self.state = 0

    def step(self, action):
        """Sample one transition for ``action`` from the current state.

        Args:
            action: action index; a plain int or a size-1 numpy array (as
                returned by ``model.predict``) are both accepted.

        Returns:
            Tuple ``(next_state, reward, done, info)`` where ``next_state`` is
            an int, ``reward`` a float, ``done`` is True once
            ``TERMINAL_STATE`` has been entered, and ``info`` is an empty dict.
        """
        # Normalise array-valued actions so table lookups yield scalars.
        action = int(np.asarray(action).item())

        next_state_probs = self.transition_model[action, self.state, :]
        next_state = int(np.random.choice(next_state_probs.shape[0], p=next_state_probs))
        reward = float(self.reward_model[action, self.state, next_state])
        done = next_state == self.TERMINAL_STATE

        self.state = next_state
        # The observation is the state index itself.
        return self.state, reward, done, {}

    def reset(self):
        """Put the environment back in state 0 and return that observation."""
        self.state = 0
        return self.state

    def render(self, mode='human'):
        """No-op: this environment has nothing to draw."""
        pass

In [34]:
# Model parameters used to build the transition and reward tables
p, q = 1, 1              # only the product p*q enters the transition model
R, C, C_1 = 100, 50, 40  # terms combined in the reward entries
S, K = 200, 70
W, U = 1, 2
discount_factor = 0.99   # handed to the environment and to DQN as gamma

In [35]:
# Transition probabilities, laid out as [action][state][next_state]
transition_model = np.array(
    [
        # action 0
        [[1, 0, 0, 0],
         [0, 1, 0, 0],
         [0, 0, 1, 0],
         [1, 0, 0, 0]],
        # action 1
        [[0, 1, 0, 0],
         [0, 1, 0, 0],
         [0, 1, 0, 0],
         [0, 0, 0, 1]],
        # action 2
        [[0, 0, 1 - p*q, p*q],
         [0, 0, 1 - p*q, p*q],
         [0, 0, 1 - p*q, p*q],
         [0, 0, 0, 1]],
    ],
    dtype=float,
)

# Rewards for each (action, state, next_state) triple, same layout as above
reward_model = np.array(
    [
        # action 0
        [[R - C, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [- S, 0, 0, 0]],
        # action 1
        [[0, - K + R - C + W, 0, 0],
         [0, R - C + W, 0, 0],
         [0, R - C + W, 0, 0],
         [0, 0, 0, 0]],
        # action 2
        [[0, 0, - K + R - C_1 + W + U, - K - C_1 + W],
         [0, 0, R - C_1 + W + U, - C_1 + W],
         [0, 0, R - C_1 + W + U, - C_1 + W],
         [0, 0, 0, 0]],
    ],
    dtype=float,
)

In [36]:
# Create the custom environment
env = CustomEnv(transition_model, reward_model, discount_factor)

# Create the DQN agent. A fixed seed keeps training runs repeatable
# (see the Stable-Baselines3 docs for exactly which RNGs `seed` covers).
model = DQN(MlpPolicy, env, verbose=1, gamma=discount_factor, seed=0)

# Train the agent
model.learn(total_timesteps=20000)

# Evaluate the trained agent: roll the greedy policy forward for a fixed
# number of steps and record what it earns, instead of discarding the rewards.
n_eval_steps = 100
total_reward = 0.0
running_return = 0.0
episode_returns = []

obs = env.reset()
for _ in range(n_eval_steps):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    total_reward += float(reward)
    running_return += float(reward)
    if done:
        episode_returns.append(running_return)
        running_return = 0.0
        obs = env.reset()

print(
    f"Evaluation over {n_eval_steps} steps: total reward {total_reward:.2f}, "
    f"{len(episode_returns)} completed episode(s)"
)
# A policy that never enters the terminal state completes no episodes.
if episode_returns:
    print(f"Mean return per completed episode: {np.mean(episode_returns):.2f}")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.25     |
|    ep_rew_mean      | -58.5    |
|    exploration_rate | 0.996    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2182     |
|    time_elapsed     | 0        |
|    total_timesteps  | 9        |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.62     |
|    ep_rew_mean      | -33.5    |
|    exploration_rate | 0.99     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3579     |
|    time_elapsed     | 0        |
|    total_timesteps  | 21       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.75     |
|    ep_rew_mean      | -29.2  

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.5      |
|    ep_rew_mean      | -3.54    |
|    exploration_rate | 0.96     |
| time/               |          |
|    episodes         | 24       |
|    fps              | 8090     |
|    time_elapsed     | 0        |
|    total_timesteps  | 84       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.32     |
|    ep_rew_mean      | -11.4    |
|    exploration_rate | 0.956    |
| time/               |          |
|    episodes         | 28       |
|    fps              | 8154     |
|    time_elapsed     | 0        |
|    total_timesteps  | 93       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.5      |
|    ep_rew_mean      | -6.19    |
|    exploration_rate | 0.947    |
| time/               |          |
|    episodes       

In [37]:
# Print the greedy action the trained agent picks in every state.
# The number of states is read from the environment rather than hard-coded,
# so this cell keeps working if the model tables change size.
print("Learned Policy:")
for state in range(env.observation_space.n):
    # The state is wrapped in a length-1 array, so the action prints as e.g. [0].
    action, _ = model.predict(np.array([state]), deterministic=True)
    print(f"State {state}: Action {action}")

# Close the environment
env.close()

Learned Policy:
State 0: Action [0]
State 1: Action [1]
State 2: Action [0]
State 3: Action [0]
