In [1]:
%matplotlib ipympl

import itertools
import pickle
import functools
import numpy as np
import scipy.special as sps
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook, tnrange, tqdm
tqdm.monitor_interval = 0 # https://github.com/tqdm/tqdm/issues/481

## Utility functions

In [2]:
"""
Fully cached memoizer decorator
from https://wiki.python.org/moin/PythonDecoratorLibrary
"""
def memoize(obj):
    cache = obj.cache = {}
    @functools.wraps(obj)
    def memoizer(*args, **kwargs):
        key = str(args) + str(kwargs)
        if key not in cache:
            cache[key] = obj(*args, **kwargs)
        return cache[key]
    return memoizer


"""
Poisson PDF
"""
@memoize
def poisson_prob(n, lam):
    return np.power(lam, n) / sps.factorial(n, exact=True) * np.exp(-lam)


"""
Evaluate probability of Poisson PDF for all ns >= n
"""
@memoize
def poisson_remaining_prob(n, lam):
    if n == 0:
        return 1.
    ns = np.arange(n)
    return 1. - np.sum(np.power(lam, ns) / sps.factorial(ns, exact=True) * np.exp(-lam))


## Transition functions (& state/action space dimensions)

In [3]:
MAX_NUM_CARS = 20
MAX_NUM_TRANSFERS = 5


"""
s, sp: 2x1 state - [num_cars_in_first_loc; num_cars_in_second_loc], each \in [0, MAX_NUM_CARS]
a: action \in [-MAX_NUM_TRANSFERS, MAX_NUM_TRANSFERS], representing number of cars moved from first_loc to second_loc

Returns tuple: probability, (expected) reward
"""
@memoize
def transition_prob_and_reward_eg42(s, a, sp, max_num_cars=MAX_NUM_CARS, max_num_transfer=MAX_NUM_TRANSFERS, r_transfer=-2, r_rental=10):
    nS = max_num_cars+1
    
    r = 0
    p = 0
    
    # Check for invalid actions
    if a > 0 and s[0] < a:
        return 0., 0
    elif a < 0 and s[1] < -a:
        return 0., 0
    
    # Determine number of cars after transfer, before returns and new rentals
    r = r_transfer*abs(a)
    s_after_transfer = np.copy(s)
    s_after_transfer[0] -= a
    s_after_transfer[1] += a
    if s_after_transfer[0] > max_num_cars:
        s_after_transfer[0] = max_num_cars
    if s_after_transfer[1] > max_num_cars:
        s_after_transfer[1] = max_num_cars

    # Compute sum of probabilities by enumerating all possible returns (accumulate all remaining probs on cars=max_num_cars) and determine corresponding number of rentals
    exp_r_rentals = 0
    for num_returns_first_loc in range(max(0, sp[0]-s_after_transfer[0]), max_num_cars-s_after_transfer[0]+1):
        cars_after_return_first_loc = s_after_transfer[0]+num_returns_first_loc
        num_rentals_first_loc = cars_after_return_first_loc - sp[0]

        if cars_after_return_first_loc == max_num_cars:
            prob_returns_first_loc = poisson_remaining_prob(n=num_returns_first_loc, lam=3) # more returns still result in same car state
        else:
            prob_returns_first_loc = poisson_prob(n=num_returns_first_loc, lam=3)
        if sp[0] == 0:
            prob_rentals_first_loc = poisson_remaining_prob(n=num_rentals_first_loc, lam=3) # more rentals still result in same car state
        else:
            prob_rentals_first_loc = poisson_prob(n=num_rentals_first_loc, lam=3)
        
        for num_returns_sec_loc in range(max(0, sp[1]-s_after_transfer[1]), max_num_cars-s_after_transfer[1]+1):
            cars_after_return_sec_loc = s_after_transfer[1]+num_returns_sec_loc
            num_rentals_sec_loc = cars_after_return_sec_loc - sp[1]
            
            if cars_after_return_sec_loc == max_num_cars:
                prob_returns_sec_loc = poisson_remaining_prob(n=num_returns_sec_loc, lam=2) # more returns still result in same car state
            else:
                prob_returns_sec_loc = poisson_prob(n=num_returns_sec_loc, lam=2)
            if sp[1] == 0:
                prob_rentals_sec_loc = poisson_remaining_prob(n=num_rentals_sec_loc, lam=4) # more rentals still result in same car state
            else:
                prob_rentals_sec_loc = poisson_prob(n=num_rentals_sec_loc, lam=4)

            prob = prob_returns_first_loc * prob_returns_sec_loc * prob_rentals_first_loc * prob_rentals_sec_loc
            p += prob
            
            exp_r_rentals += r_rental*(num_rentals_first_loc+num_rentals_sec_loc) * prob
            
            if cars_after_return_sec_loc == max_num_cars:
                break
    
    if p > 0:
        exp_r_rentals /= p
        r += exp_r_rentals
    
    return p, r


"""
Since the transition function above does not seem to generate the same solution as the book, try one where the transition probabilities are simpler despite being wrong:
- do not consider the possibility of excessive rentals and returns
- do not normalize the poisson distributions (since we only consider a finite subset of rentals and returns, yet still use the poisson PDF defined over Z)
"""
@memoize
def transition_prob_and_reward_eg42_poissononly(s, a, sp, max_num_cars=MAX_NUM_CARS, max_num_transfers=MAX_NUM_TRANSFERS, r_transfer=-2, r_rental=10):
    nS = max_num_cars+1
    
    r = 0
    p = 0
    
    # Check for invalid actions
    if a > 0 and s[0] < a:
        return 0., 0
    elif a < 0 and s[1] < -a:
        return 0., 0
    
    # Determine number of cars after transfer, before returns and new rentals
    r = r_transfer*abs(a)
    s_after_transfer = np.copy(s)
    s_after_transfer[0] -= a
    s_after_transfer[1] += a
    if s_after_transfer[0] > max_num_cars:
        s_after_transfer[0] = max_num_cars
    if s_after_transfer[1] > max_num_cars:
        s_after_transfer[1] = max_num_cars

    # Compute sum of probabilities by enumerating all possible returns (accumulate all remaining probs on cars=MAX_NUM_CARS) and determine corresponding number of rentals
    exp_r_rentals = 0
    for num_returns_first_loc in range(max(0, sp[0]-s_after_transfer[0]), max_num_cars-s_after_transfer[0]+1):
        cars_after_return_first_loc = s_after_transfer[0]+num_returns_first_loc
        num_rentals_first_loc = cars_after_return_first_loc - sp[0]
        prob_returns_first_loc = poisson_prob(n=num_returns_first_loc, lam=3)
        prob_rentals_first_loc = poisson_prob(n=num_rentals_first_loc, lam=3)
        
        for num_returns_sec_loc in range(max(0, sp[1]-s_after_transfer[1]), max_num_cars-s_after_transfer[1]+1):
            cars_after_return_sec_loc = s_after_transfer[1]+num_returns_sec_loc
            num_rentals_sec_loc = cars_after_return_sec_loc - sp[1]
            prob_returns_sec_loc = poisson_prob(n=num_returns_sec_loc, lam=2)
            prob_rentals_sec_loc = poisson_prob(n=num_rentals_sec_loc, lam=4)

            prob = prob_returns_first_loc * prob_returns_sec_loc * prob_rentals_first_loc * prob_rentals_sec_loc
            p += prob
            
            exp_r_rentals += r_rental*(num_rentals_first_loc+num_rentals_sec_loc) * prob
            
            if cars_after_return_sec_loc == max_num_cars:
                break
    
    if p > 0:
        exp_r_rentals /= p
        r += exp_r_rentals
    
    return p, r


"""
action _a is now a 2-tuple: [a_transfer, a_employee]
"""
@memoize
def transition_prob_and_reward_ex45(s, a, sp, max_num_cars=MAX_NUM_CARS, max_num_transfers=MAX_NUM_TRANSFERS, r_transfer=-2, r_rental=10, max_num_cars_overflow=10, r_overflow=-4):
    nS = max_num_cars+1
    
    a_transfer, a_employee = a
    a_total = a_transfer+a_employee
    r = 0
    p = 0
    
    # Check for invalid actions
    if (a_transfer > 0 or a_employee > 0) and s[0] < a_total:
        return 0., 0
    elif (a_transfer < 0 or a_employee > 0) and s[1] < -a_total:
        return 0., 0
    
    # Determine number of cars after transfer, before returns and new rentals
    r = r_transfer*abs(a_transfer)
    s_after_transfer = np.copy(s)
    s_after_transfer[0] -= a_total
    s_after_transfer[1] += a_total
    if s_after_transfer[0] > max_num_cars:
        s_after_transfer[0] = max_num_cars
    if s_after_transfer[1] > max_num_cars:
        s_after_transfer[1] = max_num_cars

    # Compute sum of probabilities by enumerating all possible returns (accumulate all remaining probs on cars=max_num_cars) and determine corresponding number of rentals
    exp_r_rentals = 0
    exp_r_overflows = 0
    for num_returns_first_loc in range(max(0, sp[0]-s_after_transfer[0]), max_num_cars-s_after_transfer[0]+1):
        cars_after_return_first_loc = s_after_transfer[0]+num_returns_first_loc
        num_overflows_first_loc = 0 if cars_after_return_first_loc <= max_num_cars_overflow else (cars_after_return_first_loc - max_num_cars_overflow)
        num_rentals_first_loc = cars_after_return_first_loc - sp[0]

        if cars_after_return_first_loc == max_num_cars:
            prob_returns_first_loc = poisson_remaining_prob(n=num_returns_first_loc, lam=3) # more returns still result in same car state
        else:
            prob_returns_first_loc = poisson_prob(n=num_returns_first_loc, lam=3)
        if sp[0] == 0:
            prob_rentals_first_loc = poisson_remaining_prob(n=num_rentals_first_loc, lam=3) # more rentals still result in same car state
        else:
            prob_rentals_first_loc = poisson_prob(n=num_rentals_first_loc, lam=3)
        
        for num_returns_sec_loc in range(max(0, sp[1]-s_after_transfer[1]), max_num_cars-s_after_transfer[1]+1):
            cars_after_return_sec_loc = s_after_transfer[1]+num_returns_sec_loc
            num_overflows_sec_loc = 0 if cars_after_return_sec_loc <= max_num_cars_overflow else (cars_after_return_sec_loc - max_num_cars_overflow)
            num_rentals_sec_loc = cars_after_return_sec_loc - sp[1]
            
            if cars_after_return_sec_loc == max_num_cars:
                prob_returns_sec_loc = poisson_remaining_prob(n=num_returns_sec_loc, lam=2) # more returns still result in same car state
            else:
                prob_returns_sec_loc = poisson_prob(n=num_returns_sec_loc, lam=2)
            if sp[1] == 0:
                prob_rentals_sec_loc = poisson_remaining_prob(n=num_rentals_sec_loc, lam=4) # more rentals still result in same car state
            else:
                prob_rentals_sec_loc = poisson_prob(n=num_rentals_sec_loc, lam=4)

            prob = prob_returns_first_loc * prob_returns_sec_loc * prob_rentals_first_loc * prob_rentals_sec_loc
            p += prob
            
            exp_r_rentals += r_rental*(num_rentals_first_loc+num_rentals_sec_loc) * prob
            exp_r_overflows += r_overflow*(num_overflows_first_loc+num_overflows_sec_loc) * prob
            
            if cars_after_return_sec_loc == max_num_cars:
                break
    
    if p > 0:
        exp_r_rentals /= p
        r += exp_r_rentals
        exp_r_overflows /= p
        r += exp_r_overflows
    
    return p, r


## (Optional) Pre-cache / load-cached transition functions

In [4]:
# Pre-cache transition functions for example 4.2

nS = MAX_NUM_CARS+1
nA = 2*MAX_NUM_TRANSFERS+1
p_cache_eg42 = np.zeros(shape=(nS, nS, nA, nS, nS))
r_cache_eg42 = np.zeros(shape=(nS, nS, nA, nS, nS))
for s1 in tnrange(nS, desc='T[s1, ...]'):
    for s2, ai, s1p, s2p in itertools.product(range(nS), range(nA), range(nS), range(nS)):
        a = ai - MAX_NUM_TRANSFERS
        p_cache_eg42[s1, s2, ai, s1p, s2p], r_cache_eg42[s1, s2, ai, s1p, s2p] = transition_prob_and_reward_eg42(s=(s1, s2), a=a, sp=(s1p, s2p))

with open('pkl/Example_4.2_transition.pkl', 'wb') as f:
    pickle.dump({'fn': 'transition_prob_and_reward_eg42', 'p_cache_eg42': p_cache_eg42, 'r_cache_eg42': r_cache_eg42}, f)





In [5]:
# Pre-cache transition functions for exercise 4.5

nS = MAX_NUM_CARS+1
nA = 2*MAX_NUM_TRANSFERS+1
p_cache_ex45 = np.zeros(shape=(nS, nS, nA, 2, nS, nS))
r_cache_ex45 = np.zeros(shape=(nS, nS, nA, 2, nS, nS))
for s1 in tnrange(nS, desc='T[s1, ...]'):
    for s2, ai_transfer, a_employee, s1p, s2p in itertools.product(range(nS), range(nA), range(2), range(nS), range(nS)):
        a_transfer = ai_transfer - MAX_NUM_TRANSFERS
        p_cache_ex45[s1, s2, ai_transfer, a_employee, s1p, s2p], r_cache_ex45[s1, s2, ai_transfer, a_employee, s1p, s2p] = transition_prob_and_reward_ex45(s=(s1, s2), a=(a_transfer, a_employee), sp=(s1p, s2p))

with open('pkl/Exercise_4.5_transition.pkl', 'wb') as f:
    pickle.dump({'fn': 'transition_prob_and_reward_ex45', 'p_cache_ex45': p_cache_ex45, 'r_cache_ex45': r_cache_ex45}, f)





In [6]:
# Load cached transition functions

with open('pkl/Example_4.2_transition.pkl', 'rb') as f:
    d = pickle.load(f)
    p_cache_eg42, r_cache_eg42 = d['p_cache_eg42'], d['r_cache_eg42']

def transition_prob_and_reward_eg42_cached(s, a, sp, max_num_transfers=MAX_NUM_TRANSFERS):
    global p_cache_eg42, r_cache_eg42
    ai = a + max_num_transfers
    return p_cache_eg42[s[0], s[1], ai, sp[0], sp[1]], r_cache_eg42[s[0], s[1], ai, sp[0], sp[1]]

with open('pkl/Exercise_4.5_transition.pkl', 'rb') as f:
    d = pickle.load(f)
    p_cache_ex45, r_cache_ex45 = d['p_cache_ex45'], d['r_cache_ex45']

def transition_prob_and_reward_ex45_cached(s, a, sp, max_num_transfers=MAX_NUM_TRANSFERS):
    global p_cache_ex45, r_cache_ex45
    a_transfer, a_employee = a
    ai_transfer = a_transfer + max_num_transfers
    return p_cache_ex45[s[0], s[1], ai_transfer, a_employee, sp[0], sp[1]], r_cache_ex45[s[0], s[1], ai_transfer, a_employee, sp[0], sp[1]]


## Implementations of policy and value iteration

In [7]:
def policy_iteration_eg42(transition_prob_and_reward_fn, gamma=0.9, theta=1e-1, max_num_cars=MAX_NUM_CARS, max_num_transfers=MAX_NUM_TRANSFERS):
    nS = max_num_cars+1
    nA = 2*max_num_transfers+1
    stats = {'eval_delta': [], 'impr_num_a_diff': []}
    n_iter = 0
    iter_pbar = tqdm_notebook(desc='iter')
    eval_pbar = tqdm_notebook(desc='eval', leave=False)
    s1_pbar = tqdm_notebook(desc='loop(s1, ...)', leave=False)

    # Following p. 65 - Policy iteration (using iterative policy evaluation)
    # 1. Initialization
    V = np.zeros((nS, nS))
    pi = np.random.randint(low=-max_num_transfers, high=max_num_transfers+1, size=(nS, nS))
    iter_pbar.update()

    while True:
        stats['eval_delta'].append([])
        
        # 2. Policy Evaluation
        while True:
            delta = 0
            s1_prev = None
            for s1, s2 in itertools.product(range(nS), range(nS)):
                if s1 != s1_prev:
                    s1_prev = s1
                    s1_pbar.update()

                v = V[s1, s2]
                v_new = 0
                for s1p, s2p in itertools.product(range(nS), range(nS)):
                    p, r = transition_prob_and_reward_fn(s=(s1, s2), a=int(pi[s1, s2]), sp=(s1p, s2p))
                    v_new += p*(r + gamma*V[s1p, s2p])
                V[s1, s2] = v_new
                delta = max(delta, abs(v-v_new))

            stats['eval_delta'][-1].append(delta)
            eval_pbar.update()
            eval_pbar.set_postfix(delta=delta)
            
            if delta < theta:
                break
        iter_pbar.set_postfix(delta=delta)

        # 3. Policy Improvement
        num_a_diff = 0
        s1_prev = None
        for s1, s2 in itertools.product(range(nS), range(nS)):
            if s1 != s1_prev:
                s1_prev = s1
                s1_pbar.update()
            old_action = pi[s1, s2]
            value_backups = np.zeros(nA)
            for ai, s1p, s2p in itertools.product(range(nA), range(nS), range(nS)):
                a = ai - max_num_transfers
                p, r = transition_prob_and_reward_fn(s=(s1, s2), a=int(a), sp=(s1p, s2p))
                value_backups[ai] += p*(r + gamma*V[s1p, s2p])
            ai_best = np.argmax(value_backups)
            a_best = ai_best - max_num_transfers
            pi[s1, s2] = a_best
            if old_action != a_best:
                num_a_diff += 1

        n_iter += 1
        stats['impr_num_a_diff'].append(num_a_diff)
        iter_pbar.update()
        iter_pbar.set_postfix(num_a_diff=num_a_diff)
        
        if num_a_diff == 0:
            break
    
    s1_pbar.close()
    eval_pbar.close()
    print('Converged in %d iterations' % n_iter)
    
    return V, pi, stats


def value_iteration_eg42(transition_prob_and_reward_fn, gamma=0.9, theta=1e-1, max_num_cars=MAX_NUM_CARS, max_num_transfers=MAX_NUM_TRANSFERS):
    nS = max_num_cars+1
    nA = 2*max_num_transfers+1
    stats = {'delta': []}
    n_iter = 0
    iter_pbar = tqdm_notebook(desc='iter')
    s1_pbar = tqdm_notebook(desc='loop(s1, ...)', leave=False)

    # Following p. 67 - Value iteration
    # Initialization
    V = np.zeros(shape=(nS, nS))
    iter_pbar.update()

    # Value evaluation and greedy policy evaluation
    pi = np.zeros(shape=(nS, nS), dtype='int8')
    while True:
        delta = 0
        s1_prev = None
        for s1, s2 in itertools.product(range(nS), range(nS)):
            if s1 != s1_prev:
                s1_prev = s1
                s1_pbar.update()
            v = V[s1, s2]
            value_backups = np.zeros(nA)
            for ai, s1p, s2p in itertools.product(range(nA), range(nS), range(nS)):
                a = ai - max_num_transfers
                p, r = transition_prob_and_reward_fn(s=(s1, s2), a=int(a), sp=(s1p, s2p))
                value_backups[ai] += p*(r + gamma*V[s1p, s2p])
            ai_best = np.argmax(value_backups)
            a_best = ai_best - max_num_transfers
            pi[s1, s2] = a_best
            v_new = value_backups[ai_best]
            V[s1, s2] = v_new
            delta = max(delta, abs(v-v_new))

        n_iter += 1
        stats['delta'].append(delta)
        iter_pbar.update()
        iter_pbar.set_postfix(delta=delta)

        if delta < theta:
            break
    
    s1_pbar.close()
    print('Converged in %d iterations' % n_iter)
    
    return V, pi, stats


def q_value_iteration_eg42(transition_prob_and_reward_fn, gamma=0.9, theta=1e-1, max_num_cars=MAX_NUM_CARS, max_num_transfers=MAX_NUM_TRANSFERS):
    nS = max_num_cars+1
    nA = 2*max_num_transfers+1
    stats = {'delta': []}
    n_iter = 0
    iter_pbar = tqdm_notebook(desc='iter')
    s1_pbar = tqdm_notebook(desc='loop(s1, ...)', leave=False)

    # Initialization
    Q = np.zeros((nS, nS, nA))
    iter_pbar.update()

    # Q-value improvement via greedy one-step backup
    while True:
        delta = 0
        s1_prev = None
        for s1, s2, ai in itertools.product(range(nS), range(nS), range(nA)):
            if s1 != s1_prev:
                s1_prev = s1
                s1_pbar.update()
            a = ai - max_num_transfers
            q = Q[s1, s2, ai]
            q_new = 0
            for s1p, s2p in itertools.product(range(nS), range(nS)):
                p, r = transition_prob_and_reward_fn(s=(s1, s2), a=int(a), sp=(s1p, s2p))
                q_new += p*(r + gamma*np.max(Q[s1p, s2p, :]))
            Q[s1, s2, ai] = q_new
            delta = max(delta, abs(q-q_new))
            
        n_iter += 1
        stats['delta'].append(delta)
        iter_pbar.update()
        iter_pbar.set_postfix(delta=delta)

        if delta < theta:
            break

    # Extract greedy policy
    pi = np.zeros(shape=(nS, nS), dtype='int8')
    for s1, s2 in itertools.product(range(nS), range(nS)):
        ai_best = np.argmax(Q[s1, s2, :])
        a_best = ai_best - max_num_transfers
        pi[s1, s2] = a_best
        
    s1_pbar.close()
    print('Converged in %d iterations' % n_iter)
    
    return Q, pi, stats


def policy_iteration_ex45(transition_prob_and_reward_fn, gamma=0.9, theta=1e-1, max_num_cars=MAX_NUM_CARS, max_num_transfers=MAX_NUM_TRANSFERS):
    nS = max_num_cars+1
    nA = 2*max_num_transfers+1
    stats = {'eval_delta': [], 'impr_num_a_diff': []}
    n_iter = 0
    iter_pbar = tqdm_notebook(desc='iter')
    eval_pbar = tqdm_notebook(desc='eval', leave=False)
    s1_pbar = tqdm_notebook(desc='loop(s1, ...)', leave=False)

    # Following p. 65 - Policy iteration (using iterative policy evaluation)
    # 1. Initialization
    V = np.zeros((nS, nS))
    pi = np.stack((np.random.randint(low=-max_num_transfers, high=max_num_transfers+1, size=(nS, nS)),
                  np.random.randint(low=0, high=1+1, size=(nS, nS))))
    iter_pbar.update()

    while True:
        stats['eval_delta'].append([])
        
        # 2. Policy Evaluation
        while True:
            delta = 0
            s1_prev = None
            for s1, s2 in itertools.product(range(nS), range(nS)):
                if s1 != s1_prev:
                    s1_prev = s1
                    s1_pbar.update()

                v = V[s1, s2]
                v_new = 0
                for s1p, s2p in itertools.product(range(nS), range(nS)):
                    p, r = transition_prob_and_reward_fn(s=(s1, s2), a=tuple(int(a) for a in pi[:, s1, s2]), sp=(s1p, s2p))
                    v_new += p*(r + gamma*V[s1p, s2p])
                V[s1, s2] = v_new
                delta = max(delta, abs(v-v_new))

            stats['eval_delta'][-1].append(delta)
            eval_pbar.update()
            eval_pbar.set_postfix(delta=delta)
            
            if delta < theta:
                break
        iter_pbar.set_postfix(delta=delta)

        # 3. Policy Improvement
        num_a_diff = 0
        s1_prev = None
        for s1, s2 in itertools.product(range(nS), range(nS)):
            if s1 != s1_prev:
                s1_prev = s1
                s1_pbar.update()
            old_action = pi[:, s1, s2]
            value_backups = np.zeros((nA, 2))
            for ai, a_employee, s1p, s2p in itertools.product(range(nA), range(2), range(nS), range(nS)):
                a_transfer = ai - max_num_transfers
                p, r = transition_prob_and_reward_fn(s=(s1, s2), a=(int(a_transfer), int(a_employee)), sp=(s1p, s2p))
                value_backups[ai, a_employee] += p*(r + gamma*V[s1p, s2p])
            ai_best = np.argmax(value_backups)
            ai_transfer_best, a_employee_best = np.unravel_index(ai_best, value_backups.shape)
            a_transfer_best = ai_transfer_best - max_num_transfers
            pi[0, s1, s2], pi[1, s1, s2] = a_transfer_best, a_employee_best
            if old_action[0] != a_transfer_best or old_action[1] != a_employee_best:
                num_a_diff += 1

        n_iter += 1
        stats['impr_num_a_diff'].append(num_a_diff)
        iter_pbar.update()
        iter_pbar.set_postfix(num_a_diff=num_a_diff)
        
        if num_a_diff == 0:
            break
    
    s1_pbar.close()
    eval_pbar.close()
    print('Converged in %d iterations' % n_iter)
    
    return V, pi, stats


## Policy iteration

In [8]:
# Compute policy iteration for example 4.2 using cached transition fn

V, pi, stats = policy_iteration_eg42(transition_prob_and_reward_eg42_cached)

with open('pkl/Example_4.2_cached.pkl', 'wb') as f:
    pickle.dump({'V': V, 'pi': pi, 'stats': stats}, f)


Converged in 7 iterations



In [9]:
# Compute policy iteration for example 4.2

V, pi, stats = policy_iteration_eg42(transition_prob_and_reward_eg42)

with open('pkl/Example_4.2.pkl', 'wb') as f:
    pickle.dump({'V': V, 'pi': pi, 'stats': stats}, f)


Converged in 5 iterations



In [10]:
# Compute policy iteration for example 4.2 using simpler but technically incorrect transition function

V, pi, stats = policy_iteration_eg42(transition_prob_and_reward_eg42_poissononly)

with open('pkl/Example_4.2_poissononly.pkl', 'wb') as f:
    pickle.dump({'V': V, 'pi': pi, 'stats': stats}, f)


Converged in 5 iterations



In [11]:
# Compute policy iteration for example 4.5 using cached transition fn

V, pi, stats = policy_iteration_ex45(transition_prob_and_reward_ex45_cached)

with open('pkl/Exercise_4.5_cached.pkl', 'wb') as f:
    pickle.dump({'V': V, 'pi': pi, 'stats': stats}, f)


Converged in 1 iterations



## Value iteration

In [12]:
# Compute value iteration for example 4.2 using cached transition fn

V, pi, stats = value_iteration_eg42(transition_prob_and_reward_eg42_cached)

with open('pkl/Example_4.2_V_cached.pkl', 'wb') as f:
    pickle.dump({'V': V, 'pi': pi, 'stats': stats}, f)


Converged in 40 iterations



In [13]:
# Compute value iteration for example 4.2

V, pi, stats = value_iteration_eg42(transition_prob_and_reward_eg42)

with open('pkl/Example_4.2_V.pkl', 'wb') as f:
    pickle.dump({'V': V, 'pi': pi, 'stats': stats}, f)


Converged in 40 iterations



## Q-value iteration

In [14]:
# Compute Q-value iteration for example 4.2 using cached transition fn

Q, pi, stats = q_value_iteration_eg42(transition_prob_and_reward_eg42_cached)

with open('pkl/Example_4.2_Q_cached.pkl', 'wb') as f:
    pickle.dump({'Q': Q, 'pi': pi, 'stats': stats}, f)


Converged in 39 iterations



In [15]:
# Compute Q-value iteration for example 4.2

Q, pi, stats = q_value_iteration_eg42(transition_prob_and_reward_eg42)

with open('pkl/Example_4.2_Q.pkl', 'wb') as f:
    pickle.dump({'Q': Q, 'pi': pi, 'stats': stats}, f)


Converged in 39 iterations



## Visualizations

In [26]:
# Visualize final value function and policy for example 4.2

with open('pkl/Example_4.2.pkl', 'rb') as f:
    d = pickle.load(f)
    V_ref, pi_ref, stats_ref = d['V'], d['pi'], d['stats']

#results_filename = 'pkl/Example_4.2.pkl'
results_filename = 'pkl/Example_4.2_cached.pkl'
#results_filename = 'pkl/Example_4.2_poissononly.pkl' # WARNING: results drastically different, yet not so in earlier implementation...
#results_filename = 'pkl/Example_4.2_V.pkl'
#results_filename = 'pkl/Example_4.2_V_cached.pkl'
with open(results_filename, 'rb') as f:
    d = pickle.load(f)
    V, pi, stats = d['V'], d['pi'], d['stats']
    
V_diff = V.astype('float') - V_ref.astype('float')
pi_diff = (pi.astype('int8') != pi_ref.astype('int8'))
print('mean V diff:', np.mean(np.abs(V_diff[:])))
print('num pi diff:', np.sum(pi_diff[:]))
    
plt.matshow(V, origin='lower')
plt.xlabel('Number of cars in first location')
plt.ylabel('Number of cars in second location')
plt.title('Value function')

plt.matshow(pi, origin='lower')
plt.xlabel('Number of cars in first location')
plt.ylabel('Number of cars in second location')
plt.title('Policy')

mean V diff: 0.0924151900394
num pi diff: 6




Text(0.5,1.05,'Policy')

In [30]:
# Visualize final Q-value functions and policy for example 4.2

with open('pkl/Example_4.2_Q.pkl', 'rb') as f:
    d = pickle.load(f)
    Q_ref, pi_ref, stats_ref = d['Q'], d['pi'], d['stats']

#results_filename = 'pkl/Example_4.2_Q.pkl'
results_filename = 'pkl/Example_4.2_Q_cached.pkl'
with open(results_filename, 'rb') as f:
    d = pickle.load(f)
    Q, pi, stats = d['Q'], d['pi'], d['stats']
    
Q_diff = Q.astype('float') - Q_ref.astype('float')
pi_diff = (pi.astype('int8') != pi_ref.astype('int8'))
print('mean Q diff:', np.mean(np.abs(Q_diff[:])))
print('num pi diff:', np.sum(pi_diff[:]))

for ai, a in enumerate(range(-MAX_NUM_TRANSFERS, MAX_NUM_TRANSFERS+1)):
    plt.matshow(Q[:, :, ai], origin='lower')
    plt.xlabel('Number of cars in first location')
    plt.ylabel('Number of cars in second location')
    plt.title('Q-Value function for a=%d' % a)

plt.matshow(pi, origin='lower')
plt.xlabel('Number of cars in first location')
plt.ylabel('Number of cars in second location')
plt.title('Policy function')

mean Q diff: 0.0
num pi diff: 0




Text(0.5,1.05,'Policy function')

In [18]:
# Visualize final value function and policy for exercise 4.5

results_filename = 'pkl/Exercise_4.5_cached.pkl'
with open(results_filename, 'rb') as f:
    d = pickle.load(f)
    V, pi, stats = d['V'], d['pi'], d['stats']
    
plt.matshow(V, origin='lower')
plt.xlabel('Number of cars in first location')
plt.ylabel('Number of cars in second location')
plt.title('Value function')

plt.matshow(pi[0], origin='lower')
plt.xlabel('Number of cars in first location')
plt.ylabel('Number of cars in second location')
plt.title('Policy: transfers without employee +1')

plt.matshow(pi[1], origin='lower')
plt.xlabel('Number of cars in first location')
plt.ylabel('Number of cars in second location')
plt.title('Policy: transfers with employee +1')


Text(0.5,1.05,'Policy: transfers with employee +1')