In [1]:
%matplotlib ipympl

import pickle
import functools
import numpy as np
import scipy.special as sps
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook, tnrange

In [2]:
# Largest number of cars a single location can hold; each location's car count
# ranges over 0..MAX_NUM_CARS (so there are MAX_NUM_CARS+1 values per location).
MAX_NUM_CARS = 20
# Largest number of cars moved in one action; actions are the signed integers
# -MAX_NUM_TRANSFERS..MAX_NUM_TRANSFERS (2*MAX_NUM_TRANSFERS+1 actions in total).
MAX_NUM_TRANSFERS = 5


def memoize(obj): # from https://wiki.python.org/moin/PythonDecoratorLibrary
    """Decorator that caches the results of `obj` per distinct call signature.

    The cache key is the string form of the positional arguments followed by the
    string form of the keyword arguments, so the same values passed positionally
    and by keyword are cached under different keys. The cache dictionary is
    exposed as the `cache` attribute of `obj` (and, through functools.wraps, of
    the returned wrapper).
    """
    obj.cache = {}
    results = obj.cache

    @functools.wraps(obj)
    def memoizer(*args, **kwargs):
        lookup = str(args) + str(kwargs)
        if lookup in results:
            return results[lookup]
        # First call with this signature: compute, remember, and hand back.
        results[lookup] = obj(*args, **kwargs)
        return results[lookup]

    return memoizer


@memoize
def poisson_prob(n, lam):
    """Poisson probability mass P(X = n) for rate `lam`.

    n: non-negative integer count.
    lam: Poisson rate (expected count).

    Returns lam**n / n! * exp(-lam) as a floating point value.
    """
    # Use a floating point base: with an integer `lam`, np.power works in
    # fixed-width integers and silently wraps around once lam**n no longer fits,
    # which would yield a wrong (possibly negative) probability.
    return np.power(np.asarray(lam, dtype=float), n) / sps.factorial(n, exact=True) * np.exp(-lam)


@memoize
def poisson_remaining_prob(n, lam):
    """Poisson probability for all ns >= n

    Computed as 1 - sum_{k < n} P(X = k) for a Poisson variable X with rate `lam`.

    n: non-negative integer threshold.
    lam: Poisson rate (expected count).

    Returns the tail probability P(X >= n); exactly 1. for n == 0.
    """
    if n == 0:
        return 1.
    ns = np.arange(n)
    # Float base for the same reason as in the point probability: an integer
    # base makes np.power use fixed-width integers that wrap on overflow.
    head = np.sum(np.power(np.asarray(lam, dtype=float), ns) / sps.factorial(ns, exact=True) * np.exp(-lam))
    # Rounding in the subtraction can leave a tiny negative number when the tail
    # is below floating point resolution; a probability must not be negative.
    return np.maximum(0., 1. - head)


def policy_iteration_Q(transition_prob_and_reward_fn, gamma=0.9, theta=1e-6, MAX_NUM_CARS=MAX_NUM_CARS, MAX_NUM_TRANSFERS=MAX_NUM_TRANSFERS):
    """Policy iteration on the action-value function Q for a two-location car problem.

    transition_prob_and_reward_fn: called as fn(s=(s1, s2), a=a, sp=(s1p, s2p)) and
        expected to return the tuple (transition probability, expected reward).
    gamma: discount factor.
    theta: policy evaluation stops once the largest change of a sweep is below
        theta; also the tolerance within which the current action is kept during
        policy improvement.
    MAX_NUM_CARS: each location holds 0..MAX_NUM_CARS cars.
    MAX_NUM_TRANSFERS: actions are the integers -MAX_NUM_TRANSFERS..MAX_NUM_TRANSFERS.

    Returns (Q, pi, n_iters):
        Q: array of shape (nS, nS, nA); Q[s1, s2, ai] is the value of taking action
           ai - MAX_NUM_TRANSFERS in state (s1, s2).
        pi: integer array of shape (nS, nS) holding the signed action per state.
        n_iters: list with the number of evaluation sweeps of each iteration.
    """
    nS = MAX_NUM_CARS+1
    nA = MAX_NUM_TRANSFERS*2+1

    # Following p. 65 - Policy iteration (using iterative policy evaluation)
    # 1. Initialization
    Q = np.random.rand(nS, nS, nA)*400 # Re-scaled based on guess
    pi = np.random.randint(low=-MAX_NUM_TRANSFERS, high=MAX_NUM_TRANSFERS+1, size=(nS, nS))
    n_iters = []

    def _expected_return(s1, s2, a):
        """Expected return of taking action `a` in (s1, s2) and following pi afterwards."""
        q_new = 0
        for s1p in range(nS):
            for s2p in range(nS):
                p, r = transition_prob_and_reward_fn(s=(s1, s2), a=a, sp=(s1p, s2p))
                # Actions are signed; shift by MAX_NUM_TRANSFERS to get the index
                # into the last axis of Q (a negative index would wrap around).
                api = int(pi[s1p, s2p]) + MAX_NUM_TRANSFERS
                q_new += p*(r + gamma*Q[s1p, s2p, api])
        return q_new

    while True:
        # 2. Policy Evaluation
        # Only the entries Q[s, pi(s)] feed back into the backup, so the sweeps
        # iterate on those; the other actions are filled in once afterwards.
        n_evals = 0
        while True:
            delta = 0
            for s1 in tnrange(nS, desc='eval %d s1' % n_evals):
                for s2 in range(nS):
                    a = int(pi[s1, s2])
                    ai = a + MAX_NUM_TRANSFERS
                    q = Q[s1, s2, ai]
                    q_new = _expected_return(s1, s2, a)
                    Q[s1, s2, ai] = q_new
                    delta = max(delta, abs(q-q_new))
            n_evals += 1
            print('delta:', delta)
            if delta < theta:
                break

        # 3. Policy Improvement
        n_iters.append(n_evals)

        # Evaluate every non-policy action under the current policy first, so the
        # greedy step compares evaluated values rather than leftover initial ones.
        # pi is not modified in this sweep, hence all backups use the same policy.
        for s1 in tnrange(nS, desc='impr %d s1' % len(n_iters)):
            for s2 in range(nS):
                policy_ai = int(pi[s1, s2]) + MAX_NUM_TRANSFERS
                for ai in range(nA):
                    if ai != policy_ai:
                        Q[s1, s2, ai] = _expected_return(s1, s2, ai - MAX_NUM_TRANSFERS)

        num_a_diff = 0
        for s1 in range(nS):
            for s2 in range(nS):
                old_ai = int(pi[s1, s2]) + MAX_NUM_TRANSFERS
                ai_max = int(np.argmax(Q[s1, s2, :]))
                # Keep the current action when it is (numerically) as good as the
                # best one; otherwise ties could make the policy flip forever.
                if Q[s1, s2, old_ai] >= Q[s1, s2, ai_max] - theta:
                    continue
                pi[s1, s2] = ai_max - MAX_NUM_TRANSFERS
                num_a_diff += 1

        print('num_a_diff',num_a_diff)

        if num_a_diff == 0:
            break

    return Q, pi, n_iters


In [3]:
# Example 4.2

def _location_transition_terms(cars_start, cars_end, lam_returns, lam_rentals, max_cars):
    """Sum the return/rental outcomes that take one location from cars_start to cars_end.

    Returns (mass, rentals_mass): the total probability of those outcomes and the
    same sum with every outcome weighted by its number of rentals.
    """
    mass = 0.
    rentals_mass = 0.
    # Enumerate all possible returns (all remaining probability accumulates on cars == max_cars)
    for num_returns in range(max_cars+1):
        cars_after_return = cars_start + num_returns
        num_rentals = cars_after_return - cars_end
        if num_rentals >= 0:
            if cars_after_return == max_cars:
                prob_returns = poisson_remaining_prob(n=num_returns, lam=lam_returns) # more returns still result in same car state
            else:
                prob_returns = poisson_prob(n=num_returns, lam=lam_returns)
            if cars_end == 0:
                prob_rentals = poisson_remaining_prob(n=num_rentals, lam=lam_rentals) # more rentals still result in same car state
            else:
                prob_rentals = poisson_prob(n=num_rentals, lam=lam_rentals)
            prob = prob_returns * prob_rentals
            mass += prob
            rentals_mass += num_rentals * prob
        if cars_after_return == max_cars:
            break
    return mass, rentals_mass


@memoize
def transition_prob_and_reward_ex42(s, a, sp, MAX_NUM_CARS=MAX_NUM_CARS, MAX_NUM_TRANSFERS=MAX_NUM_TRANSFERS, r_transfer=-2, r_rental=10):
    """Transition probability and expected reward for moving from state s to sp under action a.

    s, sp: 2x1 state - [num_cars_in_first_loc; num_cars_in_second_loc], each \\in [0, MAX_NUM_CARS]
    a: action \\in [-MAX_NUM_TRANSFERS, MAX_NUM_TRANSFERS], representing number of cars moved from first_loc to second_loc
    MAX_NUM_CARS: capacity of each location; car counts after the transfer are capped at it.
    MAX_NUM_TRANSFERS: accepted for a uniform signature; not used in the computation.
    r_transfer: reward per transferred car (multiplied by abs(a)).
    r_rental: reward per rented car.

    Within a step the transfer is applied first, then returns, then rentals.
    NOTE(review): returned cars can therefore be rented in the same step - confirm
    this matches the intended problem statement.

    Returns tuple: probability, (expected) reward. The reward is the transfer reward
    plus the expected rental reward conditioned on reaching sp. Actions that move
    more cars than the source location holds return (0., 0).
    """
    # Check for invalid actions
    if a > 0 and s[0] < a:
        return 0., 0
    elif a < 0 and s[1] < -a:
        return 0., 0

    # Determine number of cars after transfer, before returns and new rentals
    r = r_transfer*abs(a)
    cars_first_loc = min(int(s[0] - a), MAX_NUM_CARS)
    cars_sec_loc = min(int(s[1] + a), MAX_NUM_CARS)

    # Returns and rentals of the two locations are independent, so the sum over
    # all joint outcomes factorises into one linear pass per location instead of
    # a nested loop over both.
    mass_first, rentals_first = _location_transition_terms(cars_first_loc, sp[0], 3, 3, MAX_NUM_CARS)
    mass_sec, rentals_sec = _location_transition_terms(cars_sec_loc, sp[1], 2, 4, MAX_NUM_CARS)

    p = mass_first * mass_sec
    if p > 0:
        # sum over joint outcomes of (rentals_first + rentals_sec) * prob, divided by p
        r += r_rental*(rentals_first*mass_sec + mass_first*rentals_sec) / p

    return p, r


In [None]:
# policy_iteration_Q draws its initial Q table and policy from NumPy's global
# random generator; seed it so that a fresh run of this cell is reproducible.
np.random.seed(0)
Q, pi, n_iters = policy_iteration_Q(transition_prob_and_reward_ex42)

# Store the results so they can be reloaded without repeating the computation.
with open('Exercise_4.5_Q.pkl', 'wb') as f:
    pickle.dump({'Q': Q, 'pi': pi, 'n_iters': n_iters}, f)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('Exercise_4.5_Q.pkl', 'rb') as f:
    d = pickle.load(f)
    Q, pi, n_iters = d['Q'], d['pi'], d['n_iters']

# matshow draws the first array axis (cars in the first location) along the rows,
# i.e. on the y-axis, and the second array axis (cars in the second location)
# along the columns, i.e. on the x-axis - the labels have to follow that layout.
for ai, a in enumerate(range(-MAX_NUM_TRANSFERS, MAX_NUM_TRANSFERS+1)):
    plt.matshow(Q[:, :, ai])
    plt.xlabel('Number of cars in second location')
    plt.ylabel('Number of cars in first location')
    plt.title('Q function (a=%d)' % a)

plt.matshow(pi)
plt.xlabel('Number of cars in second location')
plt.ylabel('Number of cars in first location')
plt.title('Policy function')