In [11]:
%matplotlib ipympl

import itertools
import pickle
import functools
import numpy as np
import scipy.special as sps
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook, tnrange

## Utility functions

In [12]:
"""
Fully cached memoizer decorator
from https://wiki.python.org/moin/PythonDecoratorLibrary
"""
def memoize(obj):
    """Decorator that caches every result of ``obj`` for the lifetime of the kernel.

    Adapted from https://wiki.python.org/moin/PythonDecoratorLibrary

    The cache key is the textual form of the call arguments, so arguments only need a
    stable ``str``/``repr`` (they do not have to be hashable). Two calls share a cache
    entry only if their arguments print identically, e.g. ``(1, 2)`` and ``[1, 2]`` are
    cached separately. The cache dict is exposed as ``obj.cache`` (and, through
    ``functools.wraps``, as the ``cache`` attribute of the returned wrapper).
    """
    cache = obj.cache = {}

    @functools.wraps(obj)
    def memoizer(*args, **kwargs):
        # Order keyword arguments by name so that f(a=1, b=2) and f(b=2, a=1) map to the
        # same entry instead of being computed and stored twice.
        key = str(args) + str(sorted(kwargs.items(), key=lambda item: item[0]))
        if key not in cache:
            cache[key] = obj(*args, **kwargs)
        return cache[key]
    return memoizer


"""
Poisson PDF
"""
@memoize
def poisson_prob(n, lam):
    """Poisson probability mass function: P(N = n) for N ~ Poisson(lam).

    n: number of events (an integer count).
    lam: rate (mean) of the distribution.

    Returns lam**n / n! * exp(-lam), and 0 for a negative scalar n.
    """
    # A negative count cannot occur, so its probability is zero.
    if np.ndim(n) == 0 and n < 0:
        return 0.
    # Raise a *float* base to the power n: with two integer operands np.power stays in
    # fixed-width integer arithmetic, which silently wraps around once lam**n no longer
    # fits the platform's default integer type.
    return np.power(np.asarray(lam, dtype=float), n) / sps.factorial(n, exact=True) * np.exp(-lam)


"""
Evaluate probability of Poisson PDF for all ns >= n
"""
@memoize
def poisson_remaining_prob(n, lam):
    """Upper tail of the Poisson distribution: P(N >= n) for N ~ Poisson(lam).

    n: smallest count included in the tail (an integer).
    lam: rate (mean) of the distribution.

    Returns 1 for n <= 0 (the tail then covers every possible count).
    """
    if n <= 0:
        return 1.
    # pdtrc(k, lam) is the Poisson survival function P(N > k), so P(N >= n) = pdtrc(n-1, lam).
    # Evaluating the tail directly avoids the cancellation of "1 - sum of the first n terms",
    # which loses all relative accuracy (and can even turn slightly negative) for small tails.
    return sps.pdtrc(n - 1, lam)


## Transition functions (& state/action space dimensions)

In [13]:
# Largest number of cars a single location can hold; per-location states run 0..MAX_NUM_CARS
MAX_NUM_CARS = 20
# Largest number of cars moved in one action; actions run -MAX_NUM_TRANSFERS..MAX_NUM_TRANSFERS
MAX_NUM_TRANSFERS = 5


"""
s, sp: 2x1 state - [num_cars_in_first_loc; num_cars_in_second_loc], each in [0, MAX_NUM_CARS]
a: action in [-MAX_NUM_TRANSFERS, MAX_NUM_TRANSFERS], representing number of cars moved from first_loc to second_loc

Returns tuple: probability, (expected) reward
"""
@memoize
def transition_prob_and_reward_ex42(s, a, sp, MAX_NUM_CARS=MAX_NUM_CARS, MAX_NUM_TRANSFERS=MAX_NUM_TRANSFERS, r_transfer=-2, r_rental=10):
    """Probability and expected reward of moving from state s to state sp under action a.

    s, sp: pair (cars at first location, cars at second location) before / after the step.
    a: number of cars moved from the first to the second location (negative = other direction).
    MAX_NUM_CARS: cap on the number of cars at each location.
    MAX_NUM_TRANSFERS: accepted for signature symmetry; not used in the body.
    r_transfer: reward per transferred car (multiplied by abs(a)).
    r_rental: reward per rented car.

    The step is modelled as: transfer, then returns, then rentals. Returns use Poisson
    rates 3 (first location) and 2 (second location); rentals use rates 3 and 4.

    Returns (p, r): p is the transition probability; r is the transfer reward plus the
    expected rental reward conditioned on this transition. An action that moves more cars
    than the source location holds yields (0., 0).
    """
    nS = MAX_NUM_CARS+1
    
    r = 0
    p = 0
    
    # Check for invalid actions (cannot move more cars than the source location holds)
    if a > 0 and s[0] < a:
        return 0., 0
    elif a < 0 and s[1] < -a:
        return 0., 0
    
    # Determine number of cars after transfer, before returns and new rentals
    r = r_transfer*abs(a)
    s_after_transfer = np.copy(s)
    s_after_transfer[0] -= a
    s_after_transfer[1] += a
    # Cars moved into a location beyond its capacity are dropped
    if s_after_transfer[0] > MAX_NUM_CARS:
        s_after_transfer[0] = MAX_NUM_CARS
    if s_after_transfer[1] > MAX_NUM_CARS:
        s_after_transfer[1] = MAX_NUM_CARS

    # Compute sum of probabilities by enumerating all possible returns (accumulate all remaining probs on cars=MAX_NUM_CARS) and determine corresponding number of rentals
    # The lower bound of each range guarantees at least sp cars after returns, so the implied rental count is never negative.
    exp_r_rentals = 0
    for num_returns_first_loc in range(max(0, sp[0]-s_after_transfer[0]), MAX_NUM_CARS-s_after_transfer[0]+1):
        cars_after_return_first_loc = s_after_transfer[0]+num_returns_first_loc
        # Rentals are whatever must leave to end up with exactly sp[0] cars
        num_rentals_first_loc = cars_after_return_first_loc - sp[0]

        if cars_after_return_first_loc == MAX_NUM_CARS:
            prob_returns_first_loc = poisson_remaining_prob(n=num_returns_first_loc, lam=3) # more returns still result in same car state
        else:
            prob_returns_first_loc = poisson_prob(n=num_returns_first_loc, lam=3)
        if sp[0] == 0:
            prob_rentals_first_loc = poisson_remaining_prob(n=num_rentals_first_loc, lam=3) # more rentals still result in same car state
        else:
            prob_rentals_first_loc = poisson_prob(n=num_rentals_first_loc, lam=3)
        
        for num_returns_sec_loc in range(max(0, sp[1]-s_after_transfer[1]), MAX_NUM_CARS-s_after_transfer[1]+1):
            cars_after_return_sec_loc = s_after_transfer[1]+num_returns_sec_loc
            num_rentals_sec_loc = cars_after_return_sec_loc - sp[1]
            
            if cars_after_return_sec_loc == MAX_NUM_CARS:
                prob_returns_sec_loc = poisson_remaining_prob(n=num_returns_sec_loc, lam=2) # more returns still result in same car state
            else:
                prob_returns_sec_loc = poisson_prob(n=num_returns_sec_loc, lam=2)
            if sp[1] == 0:
                prob_rentals_sec_loc = poisson_remaining_prob(n=num_rentals_sec_loc, lam=4) # more rentals still result in same car state
            else:
                prob_rentals_sec_loc = poisson_prob(n=num_rentals_sec_loc, lam=4)

            # The four events are treated as independent, so their probabilities multiply
            prob = prob_returns_first_loc * prob_returns_sec_loc * prob_rentals_first_loc * prob_rentals_sec_loc
            p += prob
            
            # Probability-weighted rental income; normalised by p after the loops
            exp_r_rentals += r_rental*(num_rentals_first_loc+num_rentals_sec_loc) * prob
            
            # The capped case is the last value of the range, so this only ends the loop explicitly
            if cars_after_return_sec_loc == MAX_NUM_CARS:
                break
    
    # Turn the weighted sum into the expected rental reward given that this transition happens
    if p > 0:
        exp_r_rentals /= p
        r += exp_r_rentals
    
    return p, r


"""
Since the transition function above does not seem to generate the same solution as the book, try one where the transition probabilities are simpler despite being wrong:
- do not consider the possibility of excessive rentals and returns
- do not normalize the poisson distributions (since we only consider a finite subset of rentals and returns, yet still use the poisson PDF defined over Z)
"""
@memoize
def transition_prob_and_reward_ex42_poissononly(s, a, sp, MAX_NUM_CARS=MAX_NUM_CARS, MAX_NUM_TRANSFERS=MAX_NUM_TRANSFERS, r_transfer=-2, r_rental=10):
    """Simplified (deliberately inexact) variant of the Example 4.2 transition model.

    Same enumeration over returns and implied rentals as the full model, but every event
    probability is the plain Poisson pmf: no tail mass is accumulated for returns that hit
    the capacity cap or for rental requests that exceed the available cars, and the
    resulting probabilities are not renormalised.

    s, sp: pair (cars at first location, cars at second location) before / after the step.
    a: number of cars moved from the first to the second location (negative = other direction).
    MAX_NUM_CARS: cap on the number of cars at each location.
    MAX_NUM_TRANSFERS: accepted for signature symmetry; not used in the body.
    r_transfer: reward per transferred car (multiplied by abs(a)).
    r_rental: reward per rented car.

    Returns (p, r): p is the (unnormalised) transition probability; r is the transfer
    reward plus the rental reward averaged over the enumerated cases. An action that moves
    more cars than the source location holds yields (0., 0).
    """
    p = 0

    # Check for invalid actions (cannot move more cars than the source location holds)
    if a > 0 and s[0] < a:
        return 0., 0
    elif a < 0 and s[1] < -a:
        return 0., 0

    # Determine number of cars after transfer, before returns and new rentals
    r = r_transfer*abs(a)
    s_after_transfer = np.copy(s)
    s_after_transfer[0] -= a
    s_after_transfer[1] += a
    # Cars moved into a location beyond its capacity are dropped
    if s_after_transfer[0] > MAX_NUM_CARS:
        s_after_transfer[0] = MAX_NUM_CARS
    if s_after_transfer[1] > MAX_NUM_CARS:
        s_after_transfer[1] = MAX_NUM_CARS

    # Enumerate all possible returns per location and determine the corresponding number of
    # rentals; the lower bound of each range keeps the implied rental count non-negative.
    exp_r_rentals = 0
    for num_returns_first_loc in range(max(0, sp[0]-s_after_transfer[0]), MAX_NUM_CARS-s_after_transfer[0]+1):
        cars_after_return_first_loc = s_after_transfer[0]+num_returns_first_loc
        # Fixed: this line previously read an undefined, misspelled name and raised NameError
        num_rentals_first_loc = cars_after_return_first_loc - sp[0]
        prob_returns_first_loc = poisson_prob(n=num_returns_first_loc, lam=3)
        prob_rentals_first_loc = poisson_prob(n=num_rentals_first_loc, lam=3)

        for num_returns_sec_loc in range(max(0, sp[1]-s_after_transfer[1]), MAX_NUM_CARS-s_after_transfer[1]+1):
            cars_after_return_sec_loc = s_after_transfer[1]+num_returns_sec_loc
            num_rentals_sec_loc = cars_after_return_sec_loc - sp[1]
            prob_returns_sec_loc = poisson_prob(n=num_returns_sec_loc, lam=2)
            prob_rentals_sec_loc = poisson_prob(n=num_rentals_sec_loc, lam=4)

            # The four events are treated as independent, so their probabilities multiply
            prob = prob_returns_first_loc * prob_returns_sec_loc * prob_rentals_first_loc * prob_rentals_sec_loc
            p += prob

            # Probability-weighted rental income; normalised by p after the loops
            exp_r_rentals += r_rental*(num_rentals_first_loc+num_rentals_sec_loc) * prob

            # The capped case is the last value of the range, so this only ends the loop explicitly
            if cars_after_return_sec_loc == MAX_NUM_CARS:
                break

    # Turn the weighted sum into the average rental reward over the enumerated cases
    if p > 0:
        exp_r_rentals /= p
        r += exp_r_rentals

    return p, r


## (Optional) Pre-cache / load-cached transition functions

In [8]:
# Pre-cache transition functions: tabulate the Example 4.2 model for every
# (s1, s2, action index, s1p, s2p) combination and pickle both tables.

nS = MAX_NUM_CARS+1
nA = 2*MAX_NUM_TRANSFERS+1
p_cache = np.zeros(shape=(nS, nS, nA, nS, nS))
r_cache = np.zeros_like(p_cache)
for s1 in tnrange(nS, desc='T[s1, ...]'):
    for s2 in range(nS):
        for ai in range(nA):
            # Action index 0..nA-1 maps to the signed transfer -MAX_NUM_TRANSFERS..MAX_NUM_TRANSFERS
            a = ai - MAX_NUM_TRANSFERS
            for s1p in range(nS):
                for s2p in range(nS):
                    p_cache[s1, s2, ai, s1p, s2p], r_cache[s1, s2, ai, s1p, s2p] = transition_prob_and_reward_ex42(s=(s1, s2), a=a, sp=(s1p, s2p))

with open('pkl/Example_4.2_transition.pkl', mode='wb') as f:
    pickle.dump(
        {
            'fn': 'transition_prob_and_reward_ex42',
            'p_cache': p_cache,
            'r_cache': r_cache,
        },
        f,
    )






In [None]:
# Load cached transition function: restore the probability and reward tables
# written by the pre-caching cell.

with open('pkl/Example_4.2_transition.pkl', mode='rb') as f:
    d = pickle.load(f)
p_cache = d['p_cache']
r_cache = d['r_cache']

def transition_prob_and_reward_ex42_cached(s, a, sp, MAX_NUM_TRANSFERS=MAX_NUM_TRANSFERS):
    """Table lookup of (probability, reward) for the transition s --a--> sp.

    Reads the module-level tables p_cache and r_cache, both indexed as
    [s[0], s[1], a + MAX_NUM_TRANSFERS, sp[0], sp[1]].
    """
    # Shift the signed action so that it becomes a non-negative table index
    index = (s[0], s[1], a + MAX_NUM_TRANSFERS, sp[0], sp[1])
    return p_cache[index], r_cache[index]


## Implementations of policy and value iteration

In [30]:
def policy_iteration(transition_prob_and_reward_fn, gamma=0.9, theta=1e-1, MAX_NUM_CARS=MAX_NUM_CARS, MAX_NUM_TRANSFERS=MAX_NUM_TRANSFERS):
    """Policy iteration with in-place iterative policy evaluation.

    transition_prob_and_reward_fn: callable invoked as fn(s=(s1, s2), a=a, sp=(s1p, s2p))
        that returns (transition probability, expected reward).
    gamma: discount factor.
    theta: an evaluation phase ends once the largest value change in one sweep is below theta.
    MAX_NUM_CARS: cap on cars per location; the state grid is (MAX_NUM_CARS+1) squared.
    MAX_NUM_TRANSFERS: actions range over -MAX_NUM_TRANSFERS..MAX_NUM_TRANSFERS.

    Returns (V, pi, n_iters): the value table, the greedy policy (signed actions) and a list
    holding the number of evaluation sweeps of each evaluation/improvement round.
    Prints the per-sweep delta and the number of changed actions per improvement.
    """
    nS = MAX_NUM_CARS+1
    nA = 2*MAX_NUM_TRANSFERS+1

    # Following p. 65 - Policy iteration (using iterative policy evaluation)
    # 1. Initialization
    V = np.random.rand(nS, nS)*200 + 420 # Re-scaled based on Figure 4.2
    pi = np.random.randint(low=-MAX_NUM_TRANSFERS, high=MAX_NUM_TRANSFERS+1, size=(nS, nS))
    n_iters = []

    while True:
        # 2. Policy Evaluation
        n_evals = 0
        while True:
            delta = 0
            for s1 in tnrange(nS, desc='%d-%d eval' % (len(n_iters), n_evals)):
                for s2 in range(nS):
                    v = V[s1, s2]
                    # Plain int so that a string-keyed memoized transition function sees the
                    # same argument text here as in the improvement step below
                    a = int(pi[s1, s2])
                    v_new = 0
                    for s1p, s2p in itertools.product(range(nS), range(nS)):
                        p, r = transition_prob_and_reward_fn(s=(s1, s2), a=a, sp=(s1p, s2p))
                        v_new += p*(r + gamma*V[s1p, s2p])
                    V[s1, s2] = v_new
                    delta = max(delta, abs(v-v_new))
            n_evals += 1
            print('delta:', delta)
            if delta < theta:
                break

        # 3. Policy Improvement
        n_iters.append(n_evals)
        num_a_diff = 0
        for s1 in tnrange(nS, desc='%d impr' % len(n_iters)):
            for s2 in range(nS):
                old_ai = int(pi[s1, s2]) + MAX_NUM_TRANSFERS
                # float64 (the numpy default), matching V, so near-equal actions are not
                # reordered by rounding in the accumulation
                backups = np.zeros(shape=(nA,))
                for ai, s1p, s2p in itertools.product(range(nA), range(nS), range(nS)):
                    a = ai - MAX_NUM_TRANSFERS
                    # Tuples (not lists), consistent with the evaluation step above
                    p, r = transition_prob_and_reward_fn(s=(s1, s2), a=a, sp=(s1p, s2p))
                    backups[ai] += p*(r + gamma*V[s1p, s2p])
                ai_max = int(np.argmax(backups))
                # Keep the current action when it is as good as the best one; otherwise the
                # policy can keep switching between equally good actions and never settle
                if np.isclose(backups[old_ai], backups[ai_max], rtol=1e-9, atol=0.):
                    ai_max = old_ai
                pi[s1, s2] = ai_max - MAX_NUM_TRANSFERS
                if ai_max != old_ai:
                    num_a_diff += 1
        print('num_a_diff',num_a_diff)
        if num_a_diff == 0:
            break

    return V, pi, n_iters


def value_iteration(transition_prob_and_reward_fn, gamma=0.9, theta=1e-1, MAX_NUM_CARS=MAX_NUM_CARS, MAX_NUM_TRANSFERS=MAX_NUM_TRANSFERS):
    """Value iteration with in-place sweeps; the greedy policy is tracked along the way.

    transition_prob_and_reward_fn: callable invoked as fn(s=(s1, s2), a=a, sp=(s1p, s2p))
        that returns (transition probability, expected reward).
    gamma: discount factor.
    theta: iteration stops once the largest value change in one sweep is below theta.
    MAX_NUM_CARS: cap on cars per location; the state grid is (MAX_NUM_CARS+1) squared.
    MAX_NUM_TRANSFERS: actions range over -MAX_NUM_TRANSFERS..MAX_NUM_TRANSFERS.

    Returns (V, pi, n_iters): the float32 value table, the int8 greedy policy (signed
    actions) and the number of sweeps performed. Prints the delta of every sweep.
    """
    nS = MAX_NUM_CARS+1
    nA = 2*MAX_NUM_TRANSFERS+1

    # Following p. 67 - Value iteration
    # Initialization
    V = np.zeros(shape=(nS, nS), dtype='float32')
    n_iters = 0

    # Value evaluation and greedy policy evaluation
    pi = np.zeros(shape=(nS, nS), dtype='int8')
    while True:
        delta = 0
        for s1 in tnrange(nS, desc='%d impr' % n_iters):
            for s2 in range(nS):
                v = V[s1, s2]
                backups = np.zeros(shape=(nA,), dtype='float32')
                for ai, s1p, s2p in itertools.product(range(nA), range(nS), range(nS)):
                    a = ai - MAX_NUM_TRANSFERS
                    # States are passed as tuples, like the other solvers do: a memoizer keyed
                    # on the argument text would otherwise store every transition a second time
                    p, r = transition_prob_and_reward_fn(s=(s1, s2), a=a, sp=(s1p, s2p))
                    backups[ai] += p*(r + gamma*V[s1p, s2p])
                ai_max = np.argmax(backups)
                a_max = int(ai_max - MAX_NUM_TRANSFERS)
                pi[s1, s2] = a_max
                v_new = backups[ai_max]
                V[s1, s2] = v_new
                delta = max(delta, abs(v-v_new))
        n_iters += 1
        print('delta:', delta)
        if delta < theta:
            break

    return V, pi, n_iters


def q_value_iteration(transition_prob_and_reward_fn, gamma=0.9, theta=1e-1, MAX_NUM_CARS=MAX_NUM_CARS, MAX_NUM_TRANSFERS=MAX_NUM_TRANSFERS):
    """Q-value iteration with in-place sweeps, followed by greedy policy extraction.

    transition_prob_and_reward_fn: callable invoked as fn(s=(s1, s2), a=a, sp=(s1p, s2p))
        that returns (transition probability, expected reward).
    gamma: discount factor.
    theta: iteration stops once the largest Q change in one sweep is below theta.
    MAX_NUM_CARS: cap on cars per location; the state grid is (MAX_NUM_CARS+1) squared.
    MAX_NUM_TRANSFERS: actions range over -MAX_NUM_TRANSFERS..MAX_NUM_TRANSFERS.

    Returns (Q, pi, n_iters): the Q table indexed [s1, s2, action index], the int8 greedy
    policy (signed actions) and the number of sweeps. Prints the delta of every sweep.
    """
    nS = MAX_NUM_CARS+1
    nA = 2*MAX_NUM_TRANSFERS+1

    # Random starting point (arbitrarily-guessed scale)
    Q = np.random.rand(nS, nS, nA)*500
    n_iters = 0

    # Greedy one-step backups, repeated until a full sweep changes no entry by theta or more
    converged = False
    while not converged:
        delta = 0
        for s1 in tnrange(nS, desc='%d impr' % n_iters):
            for s2 in range(nS):
                for ai in range(nA):
                    a = ai - MAX_NUM_TRANSFERS
                    q_old = Q[s1, s2, ai]
                    q_new = 0
                    for s1p in range(nS):
                        for s2p in range(nS):
                            p, r = transition_prob_and_reward_fn(s=(s1, s2), a=a, sp=(s1p, s2p))
                            q_new += p*(r + gamma*np.max(Q[s1p, s2p, :]))
                    Q[s1, s2, ai] = q_new
                    delta = max(delta, abs(q_old-q_new))
        n_iters += 1
        print('delta:', delta)
        converged = delta < theta

    # Greedy policy: best action index per state, shifted back to a signed transfer
    pi = np.zeros(shape=(nS, nS), dtype='int8')
    for s1 in range(nS):
        for s2 in range(nS):
            pi[s1, s2] = int(np.argmax(Q[s1, s2, :]) - MAX_NUM_TRANSFERS)

    return Q, pi, n_iters


## Policy iteration

In [25]:
# Run policy iteration for example 4.2 on the pre-computed (cached) transition tables
# and store the value table, policy and sweep counts.

V, pi, n_iters = policy_iteration(
    transition_prob_and_reward_fn=transition_prob_and_reward_ex42_cached,
)

with open('pkl/Exercise_4.5_cached.pkl', mode='wb') as f:
    pickle.dump(
        {
            'V': V,
            'pi': pi,
            'n_iters': n_iters,
        },
        f,
    )



delta: 617.004429428



delta: 389.117163838



delta: 120.225160686



delta: 72.0805822664



delta: 43.5566403587



delta: 24.7371444571



delta: 12.645466761



delta: 6.10508479605



delta: 2.84668129545



delta: 1.29754108154



delta: 0.582303454005



delta: 0.258462232435



delta: 0.113809851748



delta: 0.0498204441267



num_a_diff 396



delta: 349.011140146



delta: 78.5603851102



delta: 57.0701477742



delta: 49.5331255426



delta: 41.8724925893



delta: 35.0974336986



delta: 29.3198081947



delta: 24.4546744749



delta: 20.3799433549



delta: 16.9762469647



delta: 14.1371443603



delta: 11.7709146389



delta: 9.79974716149



delta: 8.15816052054



delta: 6.79129275913



delta: 5.65329633821



delta: 4.70591484212



delta: 3.91725524616



delta: 3.26074437613



delta: 2.71424927037



delta: 2.25933929576



delta: 1.88066897424



delta: 1.56546269025



delta: 1.30308493506



delta: 1.08468213708



delta: 0.902884284573



delta: 0.751556428739



delta: 0.625591765948



delta: 0.520739362064



delta: 0.43346073004



delta: 0.360810434304



delta: 0.300336700092



delta: 0.249998677656



delta: 0.208097571491



delta: 0.17321931157



delta: 0.144186832594



delta: 0.120020351158



delta: 0.0999043004842



num_a_diff 336



delta: 7.51239086752



delta: 3.49602669838



delta: 1.60491226568



delta: 1.2008984275



delta: 0.943649730592



delta: 0.719675416238



delta: 0.551152766149



delta: 0.446202581795



delta: 0.376296134083



delta: 0.316549200653



delta: 0.265914985944



delta: 0.223197315655



delta: 0.187250180996



delta: 0.157045480478



delta: 0.131688527531



delta: 0.110412896103



delta: 0.0925677271696



num_a_diff 103



delta: 0.734298292008



delta: 0.149188542127



delta: 0.105170128575



delta: 0.0883607726039



num_a_diff 9



delta: 0.0767491469892



num_a_diff 0


In [19]:
# Run policy iteration for example 4.2 with the transition model evaluated on the fly
# and store the value table, policy and sweep counts.

V, pi, n_iters = policy_iteration(
    transition_prob_and_reward_fn=transition_prob_and_reward_ex42,
)

with open('pkl/Exercise_4.5.pkl', mode='wb') as f:
    pickle.dump(
        {
            'V': V,
            'pi': pi,
            'n_iters': n_iters,
        },
        f,
    )



delta: 611.08552984



delta: 322.037483578



delta: 119.36497688



delta: 63.0730744873



delta: 37.7029448721



delta: 21.8011131581



delta: 11.438984592



delta: 5.79580432335



delta: 3.14854453183



delta: 1.70796078745



delta: 0.925645360824



delta: 0.50135688653



delta: 0.271437786462



delta: 0.14691603363



delta: 0.0795022672146



num_a_diff 400



delta: 348.104384173



delta: 74.3756562021



delta: 56.7247863433



delta: 44.4496755633



delta: 35.3570346936



delta: 29.988029478



delta: 25.2392459775



delta: 21.1657784812



delta: 17.7176401788



delta: 14.8171558215



delta: 12.3850795492



delta: 10.3491838153



delta: 8.64649712457



delta: 7.22322340658



delta: 6.03386866684



delta: 5.04016477226



delta: 4.21001601893



delta: 3.51654823433



delta: 2.93728101268



delta: 2.4534201244



delta: 2.0492585136



delta: 1.71167197322



delta: 1.42969586552



delta: 1.19417052285



delta: 0.997444517229



delta: 0.833126524768



delta: 0.69587791467



delta: 0.58123942523



delta: 0.485486349122



delta: 0.405507551618



delta: 0.338704406397



delta: 0.282906373593



delta: 0.236300481643



delta: 0.197372424171



delta: 0.164857359402



delta: 0.137698814213



delta: 0.11501435799



delta: 0.0960669311544



num_a_diff 334



delta: 6.26389029348



delta: 3.71677724778



delta: 1.8213338212



delta: 1.00011453169



delta: 0.720265551091



delta: 0.519945076943



delta: 0.376638437912



delta: 0.285048204503



delta: 0.240416289902



delta: 0.202329353904



delta: 0.170049690183



delta: 0.14280206256



delta: 0.119858323429



delta: 0.100567919418



delta: 0.0843645316417



num_a_diff 99



delta: 1.02314914656



delta: 0.283366050715



delta: 0.0993877303108



num_a_diff 10



delta: 0.128732499452



delta: 0.0387396962406



num_a_diff 5



delta: 0.0308170257162



num_a_diff 4



delta: 0.0258878875298



num_a_diff 0


In [None]:
# Run policy iteration for example 4.2 with the simpler but technically incorrect
# transition function and store the value table, policy and sweep counts.

V, pi, n_iters = policy_iteration(
    transition_prob_and_reward_fn=transition_prob_and_reward_ex42_poissononly,
)

with open('pkl/Exercise_4.5_poissononly.pkl', mode='wb') as f:
    pickle.dump(
        {
            'V': V,
            'pi': pi,
            'n_iters': n_iters,
        },
        f,
    )


## Value iteration

In [31]:
# Run value iteration for example 4.2 on the pre-computed (cached) transition tables
# and store the value table, policy and sweep count.

V, pi, n_iters = value_iteration(
    transition_prob_and_reward_fn=transition_prob_and_reward_ex42_cached,
)

with open('pkl/Exercise_4.5_V_cached.pkl', mode='wb') as f:
    pickle.dump(
        {
            'V': V,
            'pi': pi,
            'n_iters': n_iters,
        },
        f,
    )



delta: 299.22



delta: 109.47



delta: 83.9016



delta: 62.4605



delta: 48.0383



delta: 36.2565



delta: 30.5257



delta: 25.6084



delta: 21.449



delta: 17.9522



delta: 15.0244



delta: 12.5862



delta: 10.5536



delta: 8.84973



delta: 7.42014



delta: 6.22168



delta: 5.21667



delta: 4.37381



delta: 3.66739



delta: 3.0748



delta: 2.57803



delta: 2.16165



delta: 1.81229



delta: 1.51965



delta: 1.27423



delta: 1.06882



delta: 0.896484



delta: 0.751984



delta: 0.63089



delta: 0.529114



delta: 0.443756



delta: 0.372437



delta: 0.312347



delta: 0.262085



delta: 0.219879



delta: 0.184296



delta: 0.154755



delta: 0.12973



delta: 0.108704



delta: 0.0913391


In [17]:
# Run value iteration for example 4.2 with the transition model evaluated on the fly
# and store the value table, policy and sweep count.

V, pi, n_iters = value_iteration(
    transition_prob_and_reward_fn=transition_prob_and_reward_ex42,
)

with open('pkl/Exercise_4.5_V.pkl', mode='wb') as f:
    pickle.dump(
        {
            'V': V,
            'pi': pi,
            'n_iters': n_iters,
        },
        f,
    )



delta: 299.22



delta: 109.47



delta: 83.9016



delta: 62.4605



delta: 48.0383



delta: 36.2565



delta: 30.5257



delta: 25.6084



delta: 21.449



delta: 17.9522



delta: 15.0244



delta: 12.5862



delta: 10.5536



delta: 8.84973



delta: 7.42014



delta: 6.22168



delta: 5.21667



delta: 4.37381



delta: 3.66739



delta: 3.0748



delta: 2.57803



delta: 2.16165



delta: 1.81229



delta: 1.51965



delta: 1.27423



delta: 1.06882



delta: 0.896484



delta: 0.751984



delta: 0.63089



delta: 0.529114



delta: 0.443756



delta: 0.372437



delta: 0.312347



delta: 0.262085



delta: 0.219879



delta: 0.184296



delta: 0.154755



delta: 0.12973



delta: 0.108704



delta: 0.0913391


## Q-value iteration

In [33]:
# Run Q-value iteration for example 4.2 on the pre-computed (cached) transition tables
# and store the Q table, policy and sweep count.

Q, pi, n_iters = q_value_iteration(
    transition_prob_and_reward_fn=transition_prob_and_reward_ex42_cached,
)

with open('pkl/Exercise_4.5_Q_cached.pkl', mode='wb') as f:
    pickle.dump(
        {
            'Q': Q,
            'pi': pi,
            'n_iters': n_iters,
        },
        f,
    )



delta: 540.28938369



delta: 66.9504961917



delta: 30.4696865738



delta: 18.8258861262



delta: 12.1337311654



delta: 7.69583259041



delta: 4.9582517269



delta: 3.3244261533



delta: 2.4932960356



delta: 2.08288492156



delta: 1.73820824712



delta: 1.44951748206



delta: 1.20820676035



delta: 1.00679238488



delta: 0.838825539251



delta: 0.698811151481



delta: 0.582128837958



delta: 0.484993566725



delta: 0.40460824257



delta: 0.337827636314



delta: 0.282155729945



delta: 0.235681852864



delta: 0.196870647655



delta: 0.164466252302



delta: 0.137648475686



delta: 0.115381432758



delta: 0.0967666822984


In [None]:
# Run Q-value iteration for example 4.2 with the transition model evaluated on the fly
# and store the Q table, policy and sweep count.

Q, pi, n_iters = q_value_iteration(
    transition_prob_and_reward_fn=transition_prob_and_reward_ex42,
)

with open('pkl/Exercise_4.5_Q.pkl', mode='wb') as f:
    pickle.dump(
        {
            'Q': Q,
            'pi': pi,
            'n_iters': n_iters,
        },
        f,
    )


## Visualizations

In [34]:
# Visualize final value function and policy

# Pick one of the result files written by the cells above
#results_filename = 'pkl/Exercise_4.5.pkl'
#results_filename = 'pkl/Exercise_4.5_cached.pkl'
#results_filename = 'pkl/Exercise_4.5_poissononly.pkl'
#results_filename = 'pkl/Exercise_4.5_V.pkl'
results_filename = 'pkl/Exercise_4.5_V_cached.pkl'
# NOTE: pickle.load executes code embedded in the file; only load files produced by this notebook
with open(results_filename, 'rb') as f:
    d = pickle.load(f)
    V, pi, n_iters = d['V'], d['pi'], d['n_iters']

# V and pi are indexed [cars in first location, cars in second location]. matshow draws the
# first index (rows) along the vertical axis and the second (columns) along the horizontal
# axis, so the second location belongs on x and the first on y.
plt.matshow(V, origin='lower')
plt.xlabel('Number of cars in second location')
plt.ylabel('Number of cars in first location')
plt.title('Value function')

plt.matshow(pi, origin='lower')
plt.xlabel('Number of cars in second location')
plt.ylabel('Number of cars in first location')
plt.title('Policy')

Text(0.5,1.05,'Policy')

In [35]:
# Visualize final Q-value functions and policy

# Pick one of the result files written by the Q-value iteration cells above
#results_filename = 'pkl/Exercise_4.5_Q.pkl'
results_filename = 'pkl/Exercise_4.5_Q_cached.pkl'
# NOTE: pickle.load executes code embedded in the file; only load files produced by this notebook
with open(results_filename, 'rb') as f:
    d = pickle.load(f)
    Q, pi, n_iters = d['Q'], d['pi'], d['n_iters']

# Q is indexed [cars in first location, cars in second location, action index]. matshow draws
# the first index (rows) along the vertical axis and the second (columns) along the horizontal
# axis, so the second location belongs on x and the first on y.
for ai, a in enumerate(range(-MAX_NUM_TRANSFERS, MAX_NUM_TRANSFERS+1)):
    plt.matshow(Q[:, :, ai], origin='lower')
    plt.xlabel('Number of cars in second location')
    plt.ylabel('Number of cars in first location')
    plt.title('Q-Value function for a=%d' % a)

plt.matshow(pi, origin='lower')
plt.xlabel('Number of cars in second location')
plt.ylabel('Number of cars in first location')
plt.title('Policy function')



Text(0.5,1.05,'Policy function')

## TODOs
- exercise 4.5
- proper tqdm