In [116]:
import os
os.chdir('/Users/Alex/Desktop/Documents_4A/Winter_quarter_1/MS&E_346/RL_book/')

## Question 1 : Tabular GLIE Monte Carlo Control

In [133]:
from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap
capacity: int = 2
poisson_lambda: float = 1.0
holding_cost: float = 1.0
stockout_cost: float = 10.0
gamma: float = 0.9
si_mdp: SimpleInventoryMDPCap = SimpleInventoryMDPCap(
    capacity=capacity,
    poisson_lambda=poisson_lambda,
    holding_cost=holding_cost,
    stockout_cost=stockout_cost
)

In [134]:
from rl.dynamic_programming import value_iteration_result
from pprint import pprint

true_opt_vf, true_opt_policy = value_iteration_result(si_mdp, gamma=gamma)

print("True Optimal Value Function")
pprint(true_opt_vf)
print("True Optimal Policy")
print(true_opt_policy)

True Optimal Value Function
{NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.66095964467877,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.894855194671294,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.99189950444479,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -27.99189950444479,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -29.991899504444792,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.66095964467877}
True Optimal Policy
For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



In [135]:
#Run from repo code

from rl.function_approx import Tabular
from rl.distribution import Choose
from rl.chapter3.simple_inventory_mdp_cap import InventoryState
from typing import Iterable, Iterator, TypeVar, Callable, Mapping, Tuple
from rl.markov_process import TransitionStep, NonTerminal, FiniteMarkovProcess
from rl.chapter10.prediction_utils import fmrp_episodes_stream
from rl.chapter10.prediction_utils import unit_experiences_from_episodes
from rl.function_approx import learning_rate_schedule
from rl.monte_carlo import glie_mc_control
from rl.approximate_dynamic_programming import (ValueFunctionApprox,
                                                QValueFunctionApprox,
                                                NTStateDistribution)


episode_length_tolerance: float = 1e-5
epsilon_as_func_of_episodes: Callable[[int], float] = lambda k: k ** -0.5
initial_learning_rate: float = 0.1
half_life: float = 10000.0
exponent: float = 1.0
    
initial_qvf_dict: Mapping[Tuple[NonTerminal[InventoryState], int], float] = {
    (s, a): 0. for s in si_mdp.non_terminal_states for a in si_mdp.actions(s)
}
    
learning_rate_func: Callable[[int], float] = learning_rate_schedule(
    initial_learning_rate=initial_learning_rate,
    half_life=half_life,
    exponent=exponent
)
    
qvfs: Iterator[QValueFunctionApprox[InventoryState, int]] = glie_mc_control(
    mdp=si_mdp,
    states=Choose(si_mdp.non_terminal_states),
    approx_0=Tabular(
        values_map=initial_qvf_dict,
        count_to_weight_func=learning_rate_func
    ),
    gamma=gamma,
    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
    episode_length_tolerance=episode_length_tolerance
)

In [138]:
#fetch the last iteration

from rl.distribution import Constant
from rl.dynamic_programming import V
import itertools
import rl.iterate as iterate
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.policy import FiniteDeterministicPolicy

A = TypeVar('A')
S = TypeVar('S')

num_episodes = 1000

final_qvf: QValueFunctionApprox[InventoryState, int] = iterate.last(itertools.islice(qvfs, num_episodes))
    
def get_vf_and_policy_from_qvf(
    mdp: FiniteMarkovDecisionProcess[S, A],
    qvf: QValueFunctionApprox[S, A]
) -> Tuple[V[S], FiniteDeterministicPolicy[S, A]]:
    opt_vf: V[S] = {
        s: max(qvf((s, a)) for a in mdp.actions(s))
        for s in mdp.non_terminal_states
    }
    opt_policy: FiniteDeterministicPolicy[S, A] = \
        FiniteDeterministicPolicy({
            s.state: qvf.argmax((s, a) for a in mdp.actions(s))[1]
            for s in mdp.non_terminal_states
        })
    return opt_vf, opt_policy

opt_vf, opt_policy = get_vf_and_policy_from_qvf(
    mdp=si_mdp,
    qvf=final_qvf
)

print(f"GLIE MC Optimal Value Function with {num_episodes:d} episodes")
pprint(opt_vf)
print(f"GLIE MC Optimal Policy with {num_episodes:d} episodes")
print(opt_policy)

GLIE MC Optimal Value Function with 1000 episodes
{NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.26305099605248,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.05856049295257,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.666669208210045,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.02018798023223,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -29.56475740041169,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.8983792198719}
GLIE MC Optimal Policy with 1000 episodes
For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



In [144]:
#Tabular from scratch

from typing import Sequence
from rl.returns import returns
from rl.monte_carlo import epsilon_greedy_policy
import numpy as np


episode_length_tolerance: float = 1e-5
epsilon_as_func_of_episodes: Callable[[int], float] = lambda k: k ** -0.5
initial_learning_rate: float = 0.1
half_life: float = 10000.0
exponent: float = 1.0


def get_returns(trace,gamma,episode_length_tolerance):
    l = []
    trace = iter(trace)
    max_steps = int(np.log(episode_length_tolerance) / np.log(gamma)) if gamma < 1 else None
    if max_steps is not None:
        trace = list(itertools.islice(trace, int(max_steps * 2) ))
    for i, T in enumerate(trace): #T is transition step type 
        if i < max_steps :
            ret = sum(A.reward*(gamma**q) for q,A in enumerate(trace[i:]))
            l.append((T.state,ret))
    return l

def get_glie_mc_control(mdp,initial_qf,max_episodes,gamma, episode_length_tolerance):
    qf : QValueFunctionApprox[S,A] = Tabular(values_map=initial_qvf_dict)
    p: Policy[S, A] = epsilon_greedy_policy(qf, mdp, 1.0)
    nb_episodes = 0
    count : Mapping[Tuple[State,Action],float] = {(s, a): 0. for s in mdp.non_terminal_states for a in si_mdp.actions(s)}
    while True:
        trace: Iterable[TransitionStep[S, A]] = mdp.simulate_actions(Choose(mdp.non_terminal_states), p)
        nb_episodes +=1
        if nb_episodes > max_episodes :
            break
        qf_dict = dict(qf.values_map)
        for step in returns(trace, gamma, episode_length_tolerance) :
            count[(step.state,step.action)] += 1
            qf_dict = dict(qf.values_map)
            qf_dict[(step.state,step.action)] = qf_dict[(step.state,step.action)] + (step.return_ - qf_dict[(step.state,step.action)])/count[(step.state,step.action)]
        qf = Tabular(values_map = qf_dict)
        p = epsilon_greedy_policy(qf, mdp, epsilon_as_func_of_episodes(num_episodes))
    return qf


In [146]:
from rl.function_approx import FunctionApprox

QValueFunctionApprox = FunctionApprox[Tuple[NonTerminal[S], A]]

initial_qf_dict: Mapping[Tuple[NonTerminal[InventoryState], int], float] = {
    (s, a): 0. for s in si_mdp.non_terminal_states for a in si_mdp.actions(s)
}
    
glie_mc = get_glie_mc_control(si_mdp,initial_qf = initial_qf_dict,max_episodes =1000,gamma = gamma,episode_length_tolerance=episode_length_tolerance)

In [147]:
my_opt_vf, my_opt_policy = get_vf_and_policy_from_qvf(
    mdp=si_mdp,
    qvf=glie_mc
)

print(f"MY GLIE MC Optimal Value Function with {num_episodes:d} episodes")
pprint(my_opt_vf)
print(f"MY GLIE MC Optimal Policy with {num_episodes:d} episodes")
print(my_opt_policy)

MY GLIE MC Optimal Value Function with 1000 episodes
{NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -1.9385957175473876,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -2.2435490662160076,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -2.8510210598685144,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -0.6403732022785171,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -1.034839098782903,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -1.898992933665553}
MY GLIE MC Optimal Policy with 1000 episodes
For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0

