In [1]:
import sys 
sys.path.append("../")
from rl.chapter3.simple_inventory_mdp_cap import *
from rl.dynamic_programming import value_iteration_result
import numpy as np 
from pprint import pprint
from typing import TypeVar, Iterable, Mapping, Dict, Callable, Iterator
import matplotlib.pyplot as plt 
import rl.markov_process as mp
import itertools
import rl.markov_decision_process as mdp
import rl.monte_carlo as mc
import rl.td as td 
from rl.distribution import Choose, Categorical
from rl.function_approx import LinearFunctionApprox, Tabular
import rl.chapter11.control_utils as control
import rl.iterate as iterate
import rl.policy as policy
from rl.approximate_dynamic_programming import QValueFunctionApprox, NTStateDistribution

First finding DP value iteration optimal action and vf for simple inventory MDP

In [2]:
#Initializing the simple inventory mrp
capacity = 2
poisson_lambda = 1.0
holding_cost = 1.0
stockout_cost = 10.0
gamma=0.9

si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
    SimpleInventoryMDPCap(
        capacity=capacity,
        poisson_lambda=poisson_lambda,
        holding_cost=holding_cost,
        stockout_cost=stockout_cost
    )

print("MDP Policy Iteration Optimal Value Function and Optimal Policy")

opt_vf_pi, opt_policy_pi = value_iteration_result(si_mdp, gamma=gamma)

pprint({k : round(v, 3) for k, v in opt_vf_pi.items()})
print()
print(opt_policy_pi)

MDP Policy Iteration Optimal Value Function and Optimal Policy
{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.895,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.661,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -27.992,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.661,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.992,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -29.992}

For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



**Question 2**


Finding the optimal policy and vf using SARSA.

In [12]:
ntr = 100000
nep = 1000000

approx_0 = Tabular()

sarsa_qvf = td.glie_sarsa(mdp=si_mdp, states=Choose(si_mdp.non_terminal_states), approx_0=approx_0,
                          γ=gamma, ϵ_as_func_of_episodes=lambda k : 1. / k, max_episode_length=nep)

*_, qvf = itertools.islice(sarsa_qvf, ntr)
opt_vf, opt_policy = control.get_vf_and_policy_from_qvf(mdp=si_mdp, qvf=qvf)

pprint({s : round(v, 3) for s, v in opt_vf.items()})
print()
print(opt_policy)

{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -29.624,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -23.576,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -23.785,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -24.537,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -24.869,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -25.825}

For State InventoryState(on_hand=0, on_order=0): Do Action 2
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



**Question 1**


Finding the optimal policy and vf using MC Control.

In [5]:
ntr = 1000

approx_0 = Tabular()

mc_qvf = mc.glie_mc_control(mdp=si_mdp, states=Choose(si_mdp.non_terminal_states), approx_0=approx_0,
                          γ=gamma, ϵ_as_func_of_episodes=lambda k : 1. / k, episode_length_tolerance = 1e-6)

*_, qvf = itertools.islice(mc_qvf, ntr)
opt_vf, opt_policy = control.get_vf_and_policy_from_qvf(mdp=si_mdp, qvf=qvf)

pprint({s : round(v, 3) for s, v in opt_vf.items()})
print()
print(opt_policy)

{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.621,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.983,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.469,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -29.063,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.463,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.347}

For State InventoryState(on_hand=0, on_order=0): Do Action 2
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



**Pending:** Extending the approx impementations to support backward iteration to solve the asset alloc problem