# Stay quit game
There are two states: `{in, end}` (shown as `IN` and `END` in the outputs below).
From `in` you can either stay or quit:
- if you quit, you move to `end` with probability 1 and receive reward 10 (default)
- if you stay:
    - you move to `end` with probability 1/3 and receive reward 4 (default)
    - you remain in `in` with probability 2/3 and receive reward 4 (default)


In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
from stay_quit import StayQuitMDP
from models import Policy, StationaryPolicy
from utils import transitions_table, mdp_to_graph, plot_mdp

In [3]:
# Build the game: discount factor 0.99, staying keeps you in IN with
# probability 2/3, and quitting pays 10.
mdp = StayQuitMDP(
    gamma=0.99,
    stay_in_prob=2 / 3,
    quit_reward=10,
)

In [4]:
# Tabulate each (from_state, action, to_state) transition with its reward and probability.
T = transitions_table(mdp)

In [5]:
# Display the transition table; note that it also lists the zero-probability quit -> IN row.
T

Unnamed: 0,from_state,action,to_state,reward,probability
0,IN,stay,IN,4.0,0.666667
1,IN,stay,END,4.0,0.333333
2,IN,quit,IN,0.0,0.0
3,IN,quit,END,10.0,1.0


## Visualize the MDP

In [6]:
from pyvis.network import Network
import pyvis

In [7]:
# Turn the MDP into a graph and list every edge with its attribute dict.
# As the printed output shows, edges are either 'state_action' (state -> action node)
# or 'transition' (action node -> state, carrying probability p and reward r).
mg = mdp_to_graph(mdp)
for source, target, attrs in mg.edges(data=True):
    print(source, target, attrs)

0 2 {'type': 'state_action', 'action': 'stay', 'label': 'stay'}
0 3 {'type': 'state_action', 'action': 'quit', 'label': 'quit'}
2 0 {'type': 'transition', 'p': 0.6666666666666666, 'r': 4.0, 'label': '0.67,4.0'}
2 1 {'type': 'transition', 'p': 0.33333333333333337, 'r': 4.0, 'label': '0.33,4.0'}
3 1 {'type': 'transition', 'p': 1.0, 'r': 10, 'label': '1.0,10'}


In [8]:
# Build the interactive plot of the graph and render it to 'stay-quit-game.html'.
# NOTE(review): `net` is presumably a pyvis Network (pyvis is imported above) —
# confirm against utils.plot_mdp.
net = plot_mdp(mg)
net.show('stay-quit-game.html')

stay-quit-game.html


## Policy

In [7]:
# Stationary policy that always picks 'stay' in state IN.
stay_policy = StationaryPolicy({'IN': 'stay'}, mdp=mdp)

In [10]:
# Sample ten episodes under the always-stay policy.
# NOTE(review): episodes are random (their lengths differ in the saved output) and no
# seed is set in the visible cells, so the output changes on every re-run.
episodes = [stay_policy.episode() for _ in range(10)]

In [11]:
# Print one line per episode: the start state, then "<first letter of action> => <next state>"
# for every step, then the episode's utility as computed by the policy.
for episode in episodes:
    start_state = episode[0][0]
    steps = " ".join(
        f"{action[0]} => {next_state}"
        for _state, action, next_state, _reward in episode
    )
    utility_label = f" U({stay_policy.utility(episode)})"
    print(start_state, steps, utility_label)

IN s => IN s => IN s => IN s => IN s => IN s => IN s => END  U(27.173860837204)
IN s => IN s => IN s => END  U(11.8804)
IN s => IN s => IN s => IN s => IN s => IN s => IN s => IN s => IN s => IN s => END  U(38.247169996478206)
IN s => END  U(4.0)
IN s => IN s => IN s => END  U(11.8804)
IN s => IN s => END  U(7.96)
IN s => IN s => IN s => IN s => IN s => END  U(19.60398004)
IN s => IN s => IN s => IN s => IN s => IN s => END  U(23.4079402396)
IN s => END  U(4.0)
IN s => IN s => END  U(7.96)


## Policy evaluation

In [5]:
from algorithms import policy_evaluation

In [8]:
# Evaluate the always-stay policy. Closed form for comparison with the result:
#   V(IN) = 4 + 0.99 * (2/3) * V(IN)  =>  V(IN) = 4 / 0.34, approximately 11.7647.
# NOTE(review): epsilon is presumably the convergence tolerance — confirm in
# algorithms.policy_evaluation.
evaluation = policy_evaluation(policy=stay_policy, mdp=mdp, epsilon=1e-10)

In [29]:
# Show the state values obtained for the always-stay policy.
evaluation

{'IN': 11.76470588208768, 'END': 0.0}

## Value iteration

In [37]:
from IPython.display import display, clear_output

In [38]:
from algorithms import value_iteration
from utils import show_value_iterations

In [39]:
# Run value iteration on the game. Besides the final values and policy, the call
# returns two history objects that are passed to show_value_iterations below.
(
    optimal_value,
    optimal_policy,
    value_history,
    policy_history,
) = value_iteration(mdp=mdp, epsilon=1e-10)

In [40]:
# Final values and policy: staying (about 11.76) beats quitting immediately (reward 10).
optimal_value, optimal_policy

({'IN': 11.764705882143245, 'END': 0.0}, {'IN': 'stay', 'END': None})

In [41]:
# Browse the recorded value-iteration history.
# NOTE(review): the saved output ends with "quit" and shows V(IN)=11.759454, which is
# not the converged value — this helper looks interactive (waits for user input).
# Confirm, since that would block a non-interactive Restart & Run All.
show_value_iterations(value_history, policy_history)

Unnamed: 0,S,V,A
0,IN,11.759454,stay
1,END,0.0,


quit


## Policy iteration

In [34]:
from algorithms import policy_iteration
from utils import show_policy_iterations

In [35]:
# Run policy iteration with its default settings.
# NOTE(review): `pi` is presumably the final policy and `pi_history` the sequence of
# intermediate iterations — confirm in algorithms.policy_iteration.
pi, pi_history = policy_iteration(mdp)

In [36]:
# Browse the recorded policy-iteration history.
# NOTE(review): the saved output ends with "quit", so this helper appears to wait for
# user input — confirm, since that would block a non-interactive Restart & Run All.
show_policy_iterations(pi_history)

Unnamed: 0,S,V,A
0,IN,11.764706,stay
1,END,0.0,


quit


In [None]:
# Inspect the `actions` attribute of the policy returned by policy iteration
# (presumably the state -> action mapping; this cell has no saved output yet).
pi.actions