# Stay quit game
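There are two states, `{in, end}` (shown as `IN` and `END` in the outputs below); every episode starts in `in` and stops once `end` is reached.
From `in` you can either `stay` or `quit`:
- if you `quit`, you move to `end` with probability 1 and receive reward 10 (default)
- if you `stay`:
    - you move to `end` with probability 1/3 and receive reward 4 (default)
    - you remain in `in` with probability 2/3 and receive reward 4 (default)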


In [15]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [16]:
from models import StayQuitMDP, Policy
from utils import transitions_table

In [29]:
# Build the stay/quit MDP. Only gamma is overridden here; rewards and transition
# probabilities are left at the constructor's defaults. gamma acts as the discount
# on future rewards (e.g. a two-step "stay" episode is worth 4 + 0.99 * 4 = 7.96).
mdp = StayQuitMDP(gamma=.99)

In [30]:
# Tabulate the MDP's transitions: one row per (from_state, action, to_state)
# with the associated reward and probability.
T = transitions_table(mdp)

In [31]:
# Display the table. Note the quit -> IN row: it is listed with probability 0.0
# and no reward, i.e. quitting never leaves you in IN.
T

Unnamed: 0,from_state,action,to_state,reward,probability
0,IN,stay,IN,4.0,0.666667
1,IN,stay,END,4.0,0.333333
2,IN,quit,IN,,0.0
3,IN,quit,END,10.0,1.0


## Policy

In [32]:
# Policy that maps state 'IN' to the action 'stay' (no other state is given an action).
stay_policy = Policy({'IN': 'stay'}, mdp=mdp)

In [33]:
# Sample 10 independent episodes under the stay policy. Sampling is stochastic
# (episode lengths differ from run to run).
# NOTE(review): no random seed is set in this notebook, so re-running will not
# reproduce the exact episodes printed below — confirm which RNG Policy.episode uses.
episodes = [stay_policy.episode() for _ in range(10)]

In [34]:
# One line per episode: the start state, then "<first letter of action> => <next state>"
# for every step, then the episode's utility as computed by the policy.
for episode in episodes:
    print(
        episode[0][0],  # first element of the first step is the starting state
        " ".join(
            "{} => {}".format(action[0], next_state)
            for _state, action, next_state, _reward in episode
        ),
        " U({})".format(stay_policy.utility(episode)),
    )

IN s => END  U(4.0)
IN s => IN s => END  U(7.96)
IN s => IN s => IN s => IN s => END  U(15.761596)
IN s => END  U(4.0)
IN s => END  U(4.0)
IN s => IN s => END  U(7.96)
IN s => IN s => END  U(7.96)
IN s => END  U(4.0)
IN s => IN s => END  U(7.96)
IN s => END  U(4.0)


## Policy evaluation

In [35]:
from algorithms import policy_evaluation

In [36]:
# Compute the value of each state when following stay_policy in this MDP.
# epsilon=1e-10 is presumably the convergence tolerance of the iteration —
# TODO confirm against algorithms.policy_evaluation.
evaluation = policy_evaluation(policy=stay_policy, mdp=mdp, epsilon=1e-10)

In [37]:
# Result maps each state to its value under the policy.
# Sanity check for IN: V = 4 + 0.99 * (2/3) * V  =>  V = 4 / 0.34 ≈ 11.7647.
evaluation

{'IN': 11.76470588208768, 'END': 0.0}

## Value iteration

In [38]:
from algorithms import value_iteration

In [39]:
# Run value iteration on the same MDP, passing the same epsilon as used for
# policy evaluation (presumably the convergence tolerance — TODO confirm).
optimal_value = value_iteration(mdp=mdp, epsilon=1e-10)

In [40]:
# Result maps each state to a (value, best action) pair; END has no action (None).
# With gamma=.99 staying (≈11.76) beats quitting (reward 10), so the optimal
# value for IN matches the stay-policy evaluation.
optimal_value

{'IN': (11.76470588208768, 'stay'), 'END': (0.0, None)}