In [135]:
import numpy as np

class GamblerEnvironment():
    
    def __init__(self, p_H=0.5, goal=100):
        self.p_H = p_H
        self.goal = goal
        self.states = [i for i in range(1, goal)]
    
    def actions(self, state):
        assert state in self.states
        return [i for i in range(min(state, self.goal-state) + 1)]
    
    def transitions(self, state, action):
        return [{
            "probability": self.p_H,
            "state": state + action,
            "reward": 1 if state + action == self.goal else 0,
            "done": state + action == self.goal,
        }, {
            "probability": (1-self.p_H),
            "state": state - action,
            "reward": 0,
            "done": state - action == 0,
        }]
    
def q_fn(env, value, state, discount_factor):
    return [
        discount_factor * sum([
            transition["reward"] + (0 if transition["done"] else transition["probability"] * value[transition["state"]]) \
            for transition in env.transitions(state, action)
        ]) for action in env.actions(state)
    ]
    
def policy_evaluation(policy, env, theta=0.00001, discount_factor=1.0):
    V = np.zeros(env.goal)
    new_V = np.zeros(env.goal)
    distance = 1
    while distance > theta:
        for state in env.states:
            actions = env.actions(state)
            q = q_fn(env, V, state, discount_factor)
            new_V[state] = sum([
                policy[state, action] * q[action] for action in env.actions(state)
            ])
        
        distance = np.linalg.norm(V - new_V)
        V = new_V.copy()        
        
    return np.array(V)

In [136]:
def random_policy(env):
    policy = -1 * np.ones([env.goal, env.goal // 2 + 1])
    for state in env.states:
        actions = env.actions(state)
        for action in actions:
            policy[state, action] = 1.0 / len(actions)
    return policy

In [155]:
def value_iteration_for_gamblers(policy_eval_fn, env, theta=0.0001, discount_factor=1.0):
    """
    Args:
        p_h: Probability of the coin coming up heads
    """
    
    policy = random_policy(env)
    new_policy = -1 * np.ones([env.goal, env.goal // 2 + 1])
    distance = 1
    i = 0
    
    while distance > theta and i < 1000:
        V = policy_eval_fn(policy, env, theta=theta, discount_factor=discount_factor)
        for state in env.states:
            actions = env.actions(state)
            q = q_fn(env, V, state, discount_factor)
            best_action = max(enumerate(q), key=lambda x: x[1])[0]
            for action in actions:
                if action == best_action:
                    new_policy[state, action] = 1
                else:
                    new_policy[state, action] = 0
        
        distance = np.linalg.norm(policy - new_policy)
        print(f"new policy, {i}, {distance} \n")
        policy = new_policy.copy()
        i += 1
    
    return policy, policy_eval_fn(policy, env, discount_factor=discount_factor, theta=theta)

In [158]:
env = GamblerEnvironment(p_H=0.25, goal=6)
rp = random_policy(env)
policy_evaluation(rp, env)

array([0.        , 0.05847664, 0.2339126 , 0.64326791, 1.05262025,
       1.78945592])

In [159]:
value_iteration_for_gamblers(policy_evaluation, env)

new policy, 0, 1.7559422921421233 
 [[-1. -1. -1. -1.]
 [ 0.  1. -1. -1.]
 [ 0.  0.  1. -1.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  1. -1.]
 [ 0.  1. -1. -1.]]
new policy, 1, 2.8284271247461903 
 [[-1. -1. -1. -1.]
 [ 1.  0. -1. -1.]
 [ 0.  1.  0. -1.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0. -1.]
 [ 0.  1. -1. -1.]]
new policy, 2, 2.8284271247461903 
 [[-1. -1. -1. -1.]
 [ 1.  0. -1. -1.]
 [ 1.  0.  0. -1.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  1. -1.]
 [ 1.  0. -1. -1.]]
new policy, 3, 2.8284271247461903 
 [[-1. -1. -1. -1.]
 [ 1.  0. -1. -1.]
 [ 0.  1.  0. -1.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0. -1.]
 [ 0.  1. -1. -1.]]
new policy, 4, 2.8284271247461903 
 [[-1. -1. -1. -1.]
 [ 1.  0. -1. -1.]
 [ 1.  0.  0. -1.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  1. -1.]
 [ 1.  0. -1. -1.]]
new policy, 5, 2.8284271247461903 
 [[-1. -1. -1. -1.]
 [ 1.  0. -1. -1.]
 [ 0.  1.  0. -1.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0. -1.]
 [ 0.  1. -1. -1.]]
new policy, 6, 2.8284271247461903 
 [[-1. -1. -1. -1.]
 [ 1.  0. -1. -1.]
 [ 1.  0.  0. -1.]
 

(array([[-1., -1., -1., -1.],
        [ 1.,  0., -1., -1.],
        [ 0.,  1.,  0., -1.],
        [ 1.,  0.,  0.,  0.],
        [ 1.,  0.,  0., -1.],
        [ 0.,  1., -1., -1.]]),
 array([0., 0., 0., 0., 0., 1.]))

In [154]:
policy_evaluation(random_policy(env), env)

array([0.        , 0.33332003, 0.6666513 , 0.99998669, 1.33331797,
       1.66665336])

In [119]:
env.transitions(5, 1)

[{'probability': 0.75, 'state': 6, 'reward': 1, 'done': True},
 {'probability': 0.25, 'state': 4, 'reward': 0, 'done': False}]