# Stay-Quit Game

![](imgs/stay-quit.png)

In [1]:
from mdp.model import MDP, Transition

In [2]:
# Numeric parameters of the game: `alpha` is used as a transition weight
# below, and r1, r2, r3 are the values passed in the last Transition slot.
alpha = 2 / 3
r1 = 4
r2 = 4
r3 = 10

# NOTE(review): the Transition arguments look like
# (state, action, next_state, probability, reward), matching the
# T(s, a, s') / reward(s, a, s') notation used later -- confirm against
# mdp.model.Transition.
conf = [
    # From state 0 under action 0: weight alpha to state 0, else state 1.
    Transition(0, 0, 0, alpha, r2),
    Transition(0, 0, 1, 1 - alpha, r1),
    # From state 0 under action 1: always to state 1.
    Transition(0, 1, 1, 1, r3),
]

# NOTE(review): the two leading 2s are presumably the number of states and
# the number of actions -- confirm against mdp.model.MDP.
mdp = MDP(2, 2, config=conf, gamma=1)

# Human-readable labels for the integer state / action indices.
mdp.set_state_names(IN=0, OUT=1)
mdp.set_action_names(stay=0, quit=1)

### Set a policy

In [3]:
from mdp.policy import Policy
import numpy as np 

In [4]:
class StayPolicy(Policy):
    """Constant policy that answers 0 for every state.

    In this notebook, index 0 is the action labelled ``stay``.
    """

    def __getitem__(self, state: int) -> int:
        # The queried state is ignored on purpose: the answer never changes.
        stay_action = 0
        return stay_action

class QuitPolicy(Policy):
    """Constant policy that answers 1 for every state.

    In this notebook, index 1 is the action labelled ``quit``.
    """

    def __getitem__(self, state: int) -> int:
        # The queried state is ignored on purpose: the answer never changes.
        quit_action = 1
        return quit_action

class RandomPolicy(Policy):
    """Policy that draws a new uniformly random index on every lookup.

    The queried state is ignored, and repeated lookups of the same state
    may return different values because each call samples again.
    """

    def __getitem__(self, state: int) -> int:
        # NOTE(review): the sampling range is sized by the number of
        # *states*, while the returned value appears to be used as an action
        # index. The two counts coincide for this 2x2 MDP -- confirm whether
        # the number of actions was intended.
        upper_bound = len(self.mdp.states)
        # With a single argument, randint samples from [0, upper_bound).
        return np.random.randint(upper_bound)

In [5]:
# Build one instance of each policy class, all bound to the same MDP.
stay, quit_p, random_p = StayPolicy(mdp), QuitPolicy(mdp), RandomPolicy(mdp)

#### Policy Evaluation

$$
V_t(s) = Q_{t-1}(s, \pi_s)
$$
$$
Q_{t-1}(s, \pi_s) = \sum\limits_{s'} T(s, \pi_s, s')\left[ reward(s, \pi_s, s') + \gamma  V_{t-1}(s')\right]
$$

In [6]:
from mdp.algorithms import policy_evaluation

In [7]:
# Evaluate each policy on the same MDP and print the result, one per line,
# in the order: stay, quit, random.
for candidate in (stay, quit_p, random_p):
    print(policy_evaluation(policy=candidate, mdp=mdp))

[12.  0.]
[10.  0.]
[10.  0.]


## Value Iteration
$$
    V^{*}_{t}(s) = \max\limits_a Q^{*}_{t-1}(s, a)
$$
$$
    Q^{*}_{t-1}(s, a) = \sum\limits_{s'} T(s, a, s')\left[reward(s, a, s') + \gamma  V^{*}_{t-1}(s')\right]
$$

In [8]:
from mdp.algorithms import value_iteration

In [9]:
# Run value iteration on the MDP; the call is unpacked into four results.
# NOTE(review): judging by the names, these are presumably the final values,
# the final policy, and per-iteration histories of each -- confirm against
# mdp.algorithms.value_iteration.
V_star, Pi_star, value_history, policy_history = value_iteration(mdp=mdp)

In [11]:
# Show the first two value-iteration results on one line.
print(V_star, Pi_star)

[12.  0.] {0: 0, 1: None}
