---

Consider the following game:

In each round you choose to stay or quit. If you quit, you get 100 Rupees and the game ends. If you stay, you get 40 Rupees and then roll a die. If the die shows 1 or 2, the game ends; otherwise it continues to the next round.

Simulate the game under different policies (a policy specifies which action, stay or quit, you take in each state) and generate the resulting reward sequence.

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
%matplotlib inline

In [None]:
# Class for the die game Markov Decision Process (MDP)
class DieGameMDP:
  """Markov Decision Process for the stay/quit die game.

  States: 0 = still in the game, 1 = out of the game (terminal).
  Actions: 'stay' or 'quit'.

  Args:
    discount: discount factor (gamma) returned by `discount()`.
      Defaults to 0.95. A value of 1 means rewards are not discounted.

  Raises:
    ValueError: if `discount` is outside the interval [0, 1].
  """

  def __init__(self, discount=0.95):
    if not 0 <= discount <= 1:
      raise ValueError(f'discount must lie in [0, 1], got {discount}')
    # state space of the MDP: 0 = in the game, 1 = out of the game
    self.states = [0, 1]
    # stored under a different name so it does not shadow the discount() method
    self.gamma = discount

  def startState(self):
    """Return the starting state (0, the 'in the game' state)."""
    return self.states[0]

  def isEnd(self, state):
    """Return True when `state` is the terminal 'out of the game' state."""
    return state == self.states[1]

  def actions(self, state):
    """Return the list of actions offered in `state`.

    The same two actions are returned for every state, including the
    terminal one; `state` is accepted only to keep the usual MDP interface.
    """
    return ['stay', 'quit']

  def NewStateProbReward(self, state, action):
    """Return the possible outcomes of taking `action` in `state`.

    Returns:
      A list of (new_state, probability, reward) tuples. The list is empty
      when `action` is not one of the actions offered in `state`.
    """
    if action not in self.actions(state):
      return []
    if state != self.states[0]:
      # terminal state: stays where it is and earns nothing
      return [(state, 1, 0)]
    if action == 'stay':
      # 40 is paid either way; the die ends the game on 1 or 2 (prob 1/3)
      return [(state, 2/3, 40), (state + 1, 1/3, 40)]
    # quitting pays 100 and always ends the game
    return [(state + 1, 1, 100)]

  def discount(self):
    """Return the discount factor gamma chosen at construction time."""
    return self.gamma

In [None]:
# create the die game MDP object that the following cells query and simulate
dieGameMDP = DieGameMDP ()

In [None]:
# which actions are available from state 0, the 'in the game' state?
dieGameMDP.actions (0)

['stay', 'quit']

In [None]:
# which actions are available from state 1, the 'out of the game' state?
# actions () returns the same list for every state, so both actions are still
# listed here even though the game is already over; this keeps the code simple
dieGameMDP.actions (1)

['stay', 'quit']

In [None]:
# list the possible (new state, probability, reward) outcomes
# from state 0 (the 'in' state) under action 'stay'
dieGameMDP.NewStateProbReward (0, 'stay')
# either stay in the game with prob 2/3 and reward 40, OR leave the game with prob 1/3 and reward 40

[(0, 0.6666666666666666, 40), (1, 0.3333333333333333, 40)]

In [None]:
# outcomes from state 0 under action 'quit': the game ends for sure with reward 100
dieGameMDP.NewStateProbReward (0, 'quit')

[(1, 1, 100)]

In [None]:
# outcomes from the terminal state 1 under 'stay': remain in state 1 with reward 0
dieGameMDP.NewStateProbReward (1, 'stay')

[(1, 1, 0)]

In [None]:
# outcomes from the terminal state 1 under 'quit': same as 'stay', remain in state 1 with reward 0
dieGameMDP.NewStateProbReward (1, 'quit')

[(1, 1, 0)]

In [None]:
# deterministic policy: a policy answers "which action do I take in this state?"
# this one always picks the first listed action, which is 'stay' for DieGameMDP
def policy1 (dieGameMDP, state) :
  """Return the first action the MDP offers in `state`."""
  available = dieGameMDP.actions (state)
  return available [0]

In [None]:
# draw one sample from {0, 1} with P(0) = 2/3 and P(1) = 1/3;
# over 100 draws, 0 shows up about 67 times and 1 about 33 times.
# with size = 1 the result is a one-element array, not a plain number
np.random.choice ([0, 1], size = 1, p = [2/3, 1/3])

array([1])

In [None]:
# range(2) stands for the integers 0 and 1; the notebook displays it as range(0, 2)
range(2)

range(0, 2)

In [None]:
# deterministic policy: always pick the second listed action,
# which is 'quit' for DieGameMDP
def policy2 (dieGameMDP, state) :
  """Return the second action the MDP offers in `state`."""
  available = dieGameMDP.actions (state)
  return available [1]

In [None]:
# non-deterministic (stochastic) policy
def policy3 (dieGameMDP, state) :
  """Pick the first listed action with prob 0.8 and the second with prob 0.2.

  Returns a plain string, like policy1 and policy2, so the result can be
  compared and printed the same way. The actions are looked up for the
  given `state` rather than for a fixed state.

  The probability vector has two entries, so the MDP must offer exactly
  two actions in `state`.
  """
  available = dieGameMDP.actions (state)
  # without `size`, choice returns a single element instead of a length-1 array
  return str (np.random.choice (available, p = [0.8, 0.2]))

In [None]:
# simulate one episode under policy1 and accumulate the discounted return
G = 0 # discounted cumulative reward, initially 0
k = 0 # time step, used as the exponent of gamma
state = dieGameMDP.startState () # every episode begins in the start state

while not dieGameMDP.isEnd (state) :

  # ask the policy what to do, then look up everything that can happen next
  action = policy1 (dieGameMDP, state)
  SPR_list = dieGameMDP.NewStateProbReward (state, action) # (new state, probability, reward) triples

  # sample one outcome according to the transition probabilities
  prob = [outcome [1] for outcome in SPR_list]
  index = np.random.choice (range (len (SPR_list)), size = 1, p = prob) [0]
  chosen = SPR_list [index]
  newState = chosen [0]
  reward = chosen [2]

  # report the transition before `state` is overwritten
  print(f'Current State = {state}, Action = {action}, New State = {newState}, Reward = {reward}')

  G = G + (dieGameMDP.discount ()) ** k * reward # add gamma^k * reward to the return
  k = k + 1
  state = newState # move on to the sampled state

print(f'Net discounted Reward = {G}')

Current State = 0, Action = stay, New State = 0, Reward = 40
Current State = 0, Action = stay, New State = 1, Reward = 40
Net discounted Reward = 78.0
