In [1]:
import gymnasium as gym
import numpy as np
import matrix_pomdp

# Load Single Objective Environment

This example is based on the paper:

Corotis, R. B., Ellis, J. H., & Jiang, M. (2005). Modeling of risk-based inspection, maintenance, and life-cycle cost with partially observable Markov decision processes. Structure and Infrastructure Engineering, 1(1), 75–84. https://doi.org/10.1080/15732470412331289305

The bridge condition is enhanced through focused maintenance, while inspections refine the state estimation.

**Maintenance Actions**  
• P₁: No repair  
• P₂: Clean and repaint corroded surfaces  
• P₃: Repaint and strengthen girders  
• P₄: Extensive repair  

**Inspection (Observation) Actions**  
• O₁: No inspection  
• O₂: Visual inspection  
• O₃: Ultrasonic inspection  

**Combined Actions**  
At each decision point, the agent selects an action that combines maintenance and inspection into a single choice, effectively merging the repair strategy and inspection approach:  

• P₁ & O₁: No repair + No inspection  
• P₁ & O₂: No repair + Visual inspection  
• P₁ & O₃: No repair + Ultrasonic inspection  

• P₂ & O₁: Clean and repaint corroded surfaces + No inspection  
• P₂ & O₂: Clean and repaint corroded surfaces + Visual inspection  
• P₂ & O₃: Clean and repaint corroded surfaces + Ultrasonic inspection  

• P₃ & O₁: Repaint and strengthen girders + No inspection  
• P₃ & O₂: Repaint and strengthen girders + Visual inspection  
• P₃ & O₃: Repaint and strengthen girders + Ultrasonic inspection  

• P₄ & O₁: Extensive repair + No inspection  
• P₄ & O₂: Extensive repair + Visual inspection  
• P₄ & O₃: Extensive repair + Ultrasonic inspection





In [2]:
# Build the arrays that parameterise the Matrix POMDP environment: the initial
# state distribution, the state-action rewards, and one transition matrix and
# one observation matrix per combined action.
#
# The model has 5 condition states and 12 combined actions. Combined action
# index a pairs maintenance option a // 3 (P1..P4) with inspection option
# a % 3 (O1..O3), in the same order as the list in the markdown above.

# Initial state distribution: uniform over the 5 states.
p_0 = np.array([0.2, 0.2, 0.2, 0.2, 0.2])

# Rewards (costs, hence non-positive): one row per state, one column per
# combined action. Within each group of three columns, visual and ultrasonic
# inspection add a cost of 4 and 18 to the no-inspection column.
r = np.array([
    [    0,    -4,   -18,    -5,    -9,   -23,   -25,   -29,   -43,   -40,   -44,   -58],
    [    0,    -4,   -18,    -8,   -12,   -26,   -80,   -84,   -98,  -120,  -124,  -138],
    [    0,    -4,   -18,   -15,   -19,   -33,  -100,  -104,  -118,  -550,  -554,  -568],
    [ -300,  -304,  -318,  -320,  -324,  -338,  -450,  -454,  -468,  -800,  -804,  -818],
    [-2000, -2004, -2018, -2050, -2054, -2068, -2500, -2504, -2518, -4000, -4004, -4018]
])

# Transition matrices, one 5x5 matrix per MAINTENANCE option (four of them,
# not one per combined action). Each row is a probability distribution:
# row = current state, column = next state.

# P1: no repair (the last state is absorbing).
P1 = np.array([
    [0.80, 0.13, 0.02, 0.00, 0.05],
    [0.00, 0.70, 0.17, 0.05, 0.08],
    [0.00, 0.00, 0.75, 0.15, 0.10],
    [0.00, 0.00, 0.00, 0.60, 0.40],
    [0.00, 0.00, 0.00, 0.00, 1.00]
])

# P2: clean and repaint corroded surfaces.
P2 = np.array([
    [0.80, 0.13, 0.02, 0.00, 0.05],
    [0.00, 0.80, 0.10, 0.02, 0.08],
    [0.00, 0.00, 0.80, 0.10, 0.10],
    [0.00, 0.00, 0.00, 0.60, 0.40],
    [0.00, 0.00, 0.00, 0.00, 1.00]
])

# P3: repaint and strengthen girders.
P3 = np.array([
    [0.80, 0.13, 0.02, 0.00, 0.05],
    [0.19, 0.65, 0.08, 0.02, 0.06],
    [0.10, 0.20, 0.56, 0.08, 0.06],
    [0.00, 0.10, 0.25, 0.55, 0.10],
    [0.00, 0.00, 0.00, 0.00, 1.00]
])

# P4: extensive repair (every row is identical, so the next-state
# distribution does not depend on the current state).
P4 = np.array([
    [0.80, 0.13, 0.02, 0.00, 0.05],
    [0.80, 0.13, 0.02, 0.00, 0.05],
    [0.80, 0.13, 0.02, 0.00, 0.05],
    [0.80, 0.13, 0.02, 0.00, 0.05],
    [0.80, 0.13, 0.02, 0.00, 0.05]
])

# Expand to one transition matrix per combined action: each maintenance matrix
# is repeated for its three inspection variants -> P1,P1,P1,P2,P2,P2,...
# Resulting shape: (12, 5, 5).
p = np.repeat(np.stack([P1, P2, P3, P4]), 3, axis=0)

# Observation matrices, one 5x5 matrix per INSPECTION option. Each row is a
# probability distribution over the 5 observations.
# NOTE(review): rows presumably index the state and columns the observation;
# confirm against the matrix_pomdp documentation.

# O1: no inspection (every row puts all mass on the first observation, so the
# observation carries no information about the state).
O1 = np.array([
    [1.0, 0.0, 0.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 0.0, 0.0]
])

# O2: visual inspection (only the first three observations ever occur).
O2 = np.array([
    [0.80, 0.20, 0.00, 0.00, 0.00],
    [0.20, 0.60, 0.20, 0.00, 0.00],
    [0.05, 0.70, 0.25, 0.00, 0.00],
    [0.00, 0.30, 0.70, 0.00, 0.00],
    [0.00, 0.00, 1.00, 0.00, 0.00]
])

# O3: ultrasonic inspection (mass concentrated on the diagonal).
O3 = np.array([
    [0.90, 0.10, 0.00, 0.00, 0.00],
    [0.05, 0.90, 0.05, 0.00, 0.00],
    [0.00, 0.05, 0.90, 0.05, 0.00],
    [0.00, 0.00, 0.05, 0.90, 0.05],
    [0.00, 0.00, 0.00, 0.00, 1.00]
])

# Expand to one observation matrix per combined action: the (O1, O2, O3)
# triple is cycled once per maintenance option -> O1,O2,O3,O1,O2,O3,...
# Resulting shape: (12, 5, 5).
o = np.tile(np.stack([O1, O2, O3]), (4, 1, 1))

# Sanity checks: fail in this cell, not deep inside the environment, if one of
# the hand-typed tables above contains a typo.
assert np.isclose(p_0.sum(), 1.0), "p_0 must sum to 1"
assert np.allclose(p.sum(axis=-1), 1.0), "every transition-matrix row must sum to 1"
assert np.allclose(o.sum(axis=-1), 1.0), "every observation-matrix row must sum to 1"
assert p.shape[0] == o.shape[0] == r.shape[1], "p, o and r must agree on the number of actions"
assert p_0.shape[0] == p.shape[1] == p.shape[2] == o.shape[1] == r.shape[0], (
    "p_0, p, o and r must agree on the number of states"
)


In [6]:
# Instantiate the single-objective Matrix POMDP from the arrays defined above,
# then reset it; the value returned by reset() is shown as the cell output.
env = gym.make(
    "matrix_pomdp/MatrixPOMDP-v0",
    p_0=p_0,
    p=p,
    o=o,
    r=r,
)
env.reset()

(array([0.2, 0.2, 0.2, 0.2, 0.2]), {})

In [7]:
env.step(0)  # Example step with action 0 (P₁ & O₁): no repair + no inspection

(array([0.16 , 0.166, 0.188, 0.16 , 0.326]),
 np.float64(-700.0000000000001),
 False,
 False,
 {})

In [8]:
env.step(9) # Example step with action 9 (P₄ & O₁): extensive repair + no inspection

(array([0.8 , 0.13, 0.02, 0.  , 0.05]),
 np.float64(-258.5999999999999),
 False,
 False,
 {})

In [6]:
# Build a second, independent environment from the same arrays for evaluation.
# It is created with true_reward=True; the value returned by reset() is shown
# as the cell output.
eval_env = gym.make(
    "matrix_pomdp/MatrixPOMDP-v0",
    p_0=p_0,
    p=p,
    o=o,
    r=r,
    true_reward=True,
)
eval_env.reset()


(array([0.2, 0.2, 0.2, 0.2, 0.2]), {})

In [None]:
# Gives the true reward for the last action taken in the evaluation environment.
# NOTE(review): the reward shown below is an integer entry of r for this action,
# presumably that of the environment's sampled hidden state, so the output may
# differ between runs - confirm against the matrix_pomdp documentation.
eval_env.step(0)  # Example step with action 0 (P₁ & O₁): no repair + no inspection

(array([0.16 , 0.166, 0.188, 0.16 , 0.326]), np.int64(-300), False, False, {})

In [8]:
eval_env.step(9)  # Example step with action 9 (P₄ & O₁): extensive repair + no inspection

(array([0.8 , 0.13, 0.02, 0.  , 0.05]), np.int64(-800), False, False, {})

# Load Multi Objective Environment

For the multi-objective environment, we need to modify our reward matrix so that it holds one reward matrix per objective.

In [3]:
# Two-objective reward array: one 5x12 (state x combined action) matrix per
# objective.

# Objective 1: the cost table (same values as the single-objective example).
r1 = np.array(
    [
        [0, -4, -18, -5, -9, -23, -25, -29, -43, -40, -44, -58],
        [0, -4, -18, -8, -12, -26, -80, -84, -98, -120, -124, -138],
        [0, -4, -18, -15, -19, -33, -100, -104, -118, -550, -554, -568],
        [-300, -304, -318, -320, -324, -338, -450, -454, -468, -800, -804, -818],
        [-2000, -2004, -2018, -2050, -2054, -2068, -2500, -2504, -2518, -4000, -4004, -4018],
    ]
)

# Objective 2: made-up rewards. Every state gets the same row -1, -2, ..., -12,
# i.e. action index a costs a + 1 regardless of the state.
r2 = np.tile(-np.arange(1, 13), (5, 1))

# Stack the objectives along a new leading axis -> shape (2, 5, 12).
r = np.stack([r1, r2])

In [4]:
# Instantiate the multi-objective Matrix POMDP: same arrays as before, but r
# now holds one reward matrix per objective and multi_objective=True is set.
# The value returned by reset() is shown as the cell output.
env = gym.make(
    "matrix_pomdp/MatrixPOMDP-v0",
    p_0=p_0,
    p=p,
    o=o,
    r=r,
    multi_objective=True,
)
env.reset()

(array([0.2, 0.2, 0.2, 0.2, 0.2]), {})

In [10]:
# The step returns a vector of rewards (one entry per objective) for the action taken
env.step(6)  # Example step with action 6 (P₃ & O₁): repaint and strengthen girders + no inspection

(array([0.6667, 0.1925, 0.0376, 0.0042, 0.099 ]),
 array([-285.2175,   -7.    ]),
 False,
 False,
 {})