Try defining a simple Markov decision process (MDP) environment in Python. For example, let's say we have:

* Two states: A and B
* Two actions: left and right

Transition probabilities:
* From A, left leads to A with probability 0.9 and to B with probability 0.1.
* From A, right leads to B with probability 1.0.
* From B, both actions lead back to B with probability 1.0.

Rewards: +1 for any transition that ends in B (including B to B), 0 otherwise.

In [31]:
import random

In [320]:
class MDP:
    """A small tabular Markov decision process driven by a transition table.

    ``params`` maps ``state -> action -> [(next_state, probability, reward), ...]``.
    An episode stops as soon as the current state is in ``terminal_states``.
    """

    def __init__(self, params, terminal_states):
        """Store the transition table and the terminal states.

        Args:
            params: Nested mapping
                ``{state: {action: [(next_state, probability, reward), ...]}}``.
            terminal_states: Container of states in which an episode ends.
        """
        self.params = params
        self.terminal_states = terminal_states

    @property
    def n_states(self):
        """Number of states that have an entry in the transition table."""
        return len(self.params)

    def next_states(self, state, action):
        """Return the ``(next_state, probability, reward)`` entries for a pair.

        Raises:
            KeyError: If ``state`` or ``action`` is missing from the table.
        """
        return self.params[state][action]

    def sample_next_state(self, state, action):
        """Sample one transition for ``(state, action)``.

        Returns:
            A ``(next_state, reward)`` tuple drawn according to the listed
            transition probabilities.

        Raises:
            ValueError: If the pair has no transitions to sample from.
        """
        transitions = self.next_states(state=state, action=action)
        if not transitions:
            raise ValueError(
                f'no transitions defined for state={state!r}, action={action!r}'
            )

        # Sample the transition tuple itself rather than an index into it.
        weights = [prob for _, prob, _ in transitions]
        next_state, _, reward = random.choices(transitions, weights=weights)[0]

        return next_state, reward

    def run_episode(self, start_state, max_steps=10, *, verbose=True):
        """Roll out one episode with uniformly random actions.

        Args:
            start_state: State in which the episode begins.
            max_steps: Upper bound on the number of transitions taken.
            verbose: When true (the default), print every transition.

        Returns:
            The sum of the rewards collected during the episode.
        """
        state = start_state
        steps = 0
        total_reward = 0
        while state not in self.terminal_states and steps < max_steps:
            action = random.choice(list(self.params[state]))
            next_state, reward = self.sample_next_state(state, action)

            if verbose:
                # NOTE: the label reads "episode" but the number is the step
                # index within this episode; the text is kept unchanged so
                # existing output stays comparable.
                print(f'episode #{steps}')
                print(f'state={state}')
                print(f'action={action}')
                print(f'next_state={next_state} ; reward={reward}')
                print()

            total_reward += reward
            state = next_state
            steps += 1

        return total_reward
        

In [321]:
# Transition table for the two-state example.
# Layout: params[state][action] -> list of (next_state, probability, reward).
params = {
    'A': {
        # 'left' usually stays in A; the occasional move to B pays +1.
        'left': [
            ('A', 0.9, 0),
            ('B', 0.1, 1),
        ],
        # 'right' always moves to B.
        'right': [
            ('B', 1.0, 1),
        ],
    },
    # From B, every action leads back to B (a fresh list per action).
    'B': {action: [('B', 1.0, 1)] for action in ('left', 'right')},
}

# Episodes end once the process is in B.
mdp = MDP(params=params, terminal_states=['B'])

In [326]:
# Roll out one episode from state 'A' with randomly chosen actions; the cell's value is the total reward collected.
mdp.run_episode('A')

episode #0
state=A
action=right
next_state=B ; reward=1



1

In [97]:
# NOTE(review): `states` is not defined by any top-level cell shown here — presumably a scratch variable from unpacking the ('A', 'left') transitions; confirm before re-running.
states[0]

'A'

In [101]:
# Number of states in the MDP (A and B). NOTE(review): this requires MDP to expose an `n_states` attribute.
mdp.n_states

2

In [26]:
# NOTE(review): scratch inspection — the value matches the next-state column of params['A']['left']; `states` is presumably left over from an earlier cell, confirm.
states

('A', 'B')

In [27]:
# NOTE(review): scratch inspection — the value matches the probability column of params['A']['left']; `probs` is presumably left over from an earlier cell, confirm.
probs

(0.9, 0.1)

In [28]:
# NOTE(review): scratch inspection — the value matches the reward column of params['A']['left']; `rewards` is presumably left over from an earlier cell, confirm.
rewards

(0, 1)

In [74]:
# random.choices returns a list of k samples (k=1 by default), hence the one-element list rather than a bare state.
random.choices(states, weights=probs)

['A']