---

Consider a self-powered rover that operates on a slope. The rover can be in one of the following four states: (1) low, (2) medium, (3) high, (4) top. The rover has a motor that can spin its wheel slowly at the expense of 4 units of energy per time step or rapidly at the expense of 12 units of energy per time step. If the motor spins the wheel slowly, with probability 0.3 it moves to the next higher state in one time step, and with probability 0.7, it slides all the way down the slope to the low state. On the other hand, if the motor spins the wheel rapidly, with probability 0.5 it moves to the next higher state in one time step, and with probability 0.5, it slides all the way down the slope to the low state. The rover’s motion terminates once it reaches the top state. The rover is initially low on the slope and aims to reach the top with minimum energy consumption. Build an MDP class to simulate the reward sequence for the rover.

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
%matplotlib inline

In [None]:
# Class for the rover Markov Decision Process (MDP)
class RoverMDP:
    """Markov Decision Process for a rover climbing a slope.

    The rover occupies one of four states ("low", "medium", "high", "top").
    At every time step it picks a motor speed ("slow" or "rapid"). With the
    speed's climb probability it moves to the next higher state; otherwise it
    slides all the way back to "low". Either way it pays the speed's energy
    cost, which is reported as a negative reward. "top" is terminal.

    Args:
        slow_cost: Energy spent per step when spinning slowly.
        rapid_cost: Energy spent per step when spinning rapidly.
        slow_prob: Probability of climbing one state when spinning slowly.
        rapid_prob: Probability of climbing one state when spinning rapidly.
        gamma: Discount factor returned by ``discount()``.

    Raises:
        ValueError: If a climb probability lies outside [0, 1].
    """

    def __init__(self, slow_cost=4, rapid_cost=12, slow_prob=0.3,
                 rapid_prob=0.5, gamma=0.95):
        for label, value in (("slow_prob", slow_prob), ("rapid_prob", rapid_prob)):
            if not 0 <= value <= 1:
                raise ValueError(f"{label} must be in [0, 1], got {value!r}")

        # State space of the MDP, ordered from the bottom of the slope to the top
        self.states = ["low", "medium", "high", "top"]

        # action -> (probability of climbing one state, reward per time step).
        # Rewards are negated energy costs, so maximising reward minimises energy.
        self._motor = {
            "slow": (slow_prob, -slow_cost),
            "rapid": (rapid_prob, -rapid_cost),
        }
        self._gamma = gamma

    # Starting state of the MDP
    def startState(self):
        """Return the state in which every episode begins ("low")."""
        return self.states[0]

    # Check if the MDP has ended
    def isEnd(self, state):
        """Return True when ``state`` is the terminal state ("top")."""
        return state == self.states[-1]

    # Possible actions from a given state
    def actions(self, state):
        """Return the available actions.

        ``state`` is not inspected: the same two actions are returned for
        every state, including the terminal one.
        """
        return list(self._motor)

    # Return the list of possible new states, the corresponding probabilities, and the corresponding rewards
    def NewStateProbReward(self, state, action):
        """Return the possible outcomes of taking ``action`` in ``state``.

        Returns:
            A list of ``(new_state, probability, reward)`` tuples. For a
            non-terminal state the slide-back outcome comes first and the
            climb outcome second. The terminal state yields a single
            zero-reward self-loop. An unknown state or action yields an
            empty list.
        """
        if self.isEnd(state):
            # Terminal state, no transitions
            return [(state, 1.0, 0)]

        if action not in self.actions(state) or state not in self.states:
            return []

        climb_prob, reward = self._motor[action]
        # The state is known and non-terminal, so a next-higher state exists.
        higher_state = self.states[self.states.index(state) + 1]
        return [
            (self.startState(), 1 - climb_prob, reward),  # Slides back to low
            (higher_state, climb_prob, reward),  # Moves one state up
        ]

    # Reward discounting
    def discount(self):
        """Return the discount factor applied to future rewards."""
        return self._gamma

In [None]:
# create the rover MDP object
roverMDP = RoverMDP ()

In [None]:
print(roverMDP.states)
print(roverMDP.actions)

['low', 'medium', 'high', 'top']
<bound method RoverMDP.actions of <__main__.RoverMDP object at 0x78199428df90>>


In [None]:
# what are the possible actions available from state index 0, i.e. the 'low' state?
roverMDP.actions (0)

['slow', 'rapid']

In [None]:
# what are the possible actions available from state index 3, i.e. the 'top' (terminal) state?
roverMDP.actions (3) # ALREADY OUT OF THE GAME SO NOTHING but coding is easier this way

['slow', 'rapid']

In [None]:
# interpret the possible (new state, corresponding probability, corresponding reward) tuples
# available from the 'low' state under action 'slow'
roverMDP.NewStateProbReward ('low', 'slow')

[('low', 0.7, -4), ('medium', 0.3, -4)]

In [None]:
roverMDP.NewStateProbReward ('low', 'rapid')

[('low', 0.5, -12), ('medium', 0.5, -12)]

In [None]:
roverMDP.NewStateProbReward ('medium', 'slow')

[('low', 0.7, -4), ('high', 0.3, -4)]

In [None]:
roverMDP.NewStateProbReward ('medium', 'rapid')

[('low', 0.5, -12), ('high', 0.5, -12)]

In [None]:
roverMDP.NewStateProbReward ('high', 'slow')

[('low', 0.7, -4), ('top', 0.3, -4)]

In [None]:
roverMDP.NewStateProbReward ('high', 'rapid')

[('low', 0.5, -12), ('top', 0.5, -12)]

In [None]:
roverMDP.NewStateProbReward ('top', 'slow')

[('top', 1.0, 0)]

In [None]:
roverMDP.NewStateProbReward ('top', 'rapid')

[('top', 1.0, 0)]

In [None]:
# a deterministic policy (what action to take in a given state?)
# here: in states 'low' or 'medium' choose 'rapid'; in states 'high' or 'top' choose 'slow'
def policy(state):
    """Map a state to the action this fixed policy prescribes.

    'low' and 'medium' map to 'rapid'; 'high' and 'top' map to 'slow'.
    Any other state yields None.
    """
    # Each entry pairs an action with the group of states that select it
    action_table = (
        ('rapid', ('low', 'medium')),
        ('slow', ('high', 'top')),
    )
    for chosen_action, state_group in action_table:
        if state in state_group:
            return chosen_action
    return None

In [None]:
np.random.choice ([0, 1], size = 1, p = [2/3, 1/3]) # out of 100 approximately, 0 will show up 66 times and 1 will show up 34 times

array([1])

In [None]:
range(2)

range(0, 2)

In [None]:
# Roll out a single episode under `policy`, accumulating the discounted return
G = 0  # Running discounted return
k = 0  # Step counter; also the exponent applied to the discount factor
state = roverMDP.startState()  # The episode begins in the MDP's start state

# Keep stepping until the terminal state is reached
while not roverMDP.isEnd(state):
    # Action prescribed by the policy for the current state
    action = policy(state)

    # Candidate outcomes as (newState, prob, reward) tuples
    SPR_list = roverMDP.NewStateProbReward(state, action)

    # Transition probabilities, in the same order as SPR_list
    prob = [outcome[1] for outcome in SPR_list]

    # Sample the position of the realised outcome according to prob
    index = np.random.choice(range(len(SPR_list)), size=1, p=prob)[0]
    newState = SPR_list[index][0]
    reward = SPR_list[index][2]

    # Log this transition
    print(f'Current State = {state}, Action = {action}, New State = {newState}, Reward = {reward}')

    # Weight the reward by discount^k, then advance the state and the step counter
    G = G + reward * roverMDP.discount() ** k
    state = newState
    k = k + 1

# Report the total discounted return of the episode
print(f'Net Discounted Reward = {G}')

Current State = low, Action = rapid, New State = medium, Reward = -12
Current State = medium, Action = rapid, New State = low, Reward = -12
Current State = low, Action = rapid, New State = low, Reward = -12
Current State = low, Action = rapid, New State = low, Reward = -12
Current State = low, Action = rapid, New State = medium, Reward = -12
Current State = medium, Action = rapid, New State = low, Reward = -12
Current State = low, Action = rapid, New State = medium, Reward = -12
Current State = medium, Action = rapid, New State = high, Reward = -12
Current State = high, Action = slow, New State = low, Reward = -4
Current State = low, Action = rapid, New State = medium, Reward = -12
Current State = medium, Action = rapid, New State = high, Reward = -12
Current State = high, Action = slow, New State = low, Reward = -4
Current State = low, Action = rapid, New State = low, Reward = -12
Current State = low, Action = rapid, New State = low, Reward = -12
Current State = low, Action = rapid, N