In [92]:
import sys
sys.path.append("../")
import math
import itertools
import numpy as np
from pprint import pprint
from collections import defaultdict
from typing import Iterable, Iterator, TypeVar, Mapping, List, Tuple, Sequence, Callable
from rl.chapter2.simple_inventory_mrp import *
from rl.returns import returns
from rl.td import td_prediction
from rl.distribution import Choose
from rl.monte_carlo import mc_prediction
from rl.markov_process import TransitionStep, ReturnStep
from rl.function_approx import Tabular, learning_rate_schedule, AdamGradient, FunctionApprox
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite, InventoryState
from rl.chapter10.prediction_utils import fmrp_episodes_stream, unit_experiences_from_episodes

In [93]:
# Parameters of the simple inventory MRP used throughout this notebook.
capacity = 2
poisson_lambda = 1.0
holding_cost = 1.0
stockout_cost = 10.0

# Build the finite MRP once; later cells sample reward traces from it.
si_mrp = SimpleInventoryMRPFinite(
    capacity=capacity,
    poisson_lambda=poisson_lambda,
    holding_cost=holding_cost,
    stockout_cost=stockout_cost,
)



**Question 1**

In [98]:
#Tabular MC for prediction

S = TypeVar("S")

def tabular_mc_prediction(
                        traces : Iterable[Iterable[TransitionStep[S]]],
                        gamma : float = 0.7,
                        ep_tol : float = 1e-6,
                        num_traces : int = 1000) -> Mapping[S, float]:
    """Estimate a value function by tabular, every-visit Monte Carlo averaging.

    Each trace is converted to a stream of return steps with
    ``returns(trace, gamma, ep_tol)``; for every step visited, its
    ``return_`` is added to a running sum for ``state`` and a visit counter
    is incremented. The estimate for a state is the mean of its returns.

    Args:
        traces: stream of episodes, each a stream of transition steps.
        gamma: discount factor forwarded to ``returns``.
        ep_tol: tolerance forwarded to ``returns``; for ``0 < gamma < 1`` it
            also caps the number of steps read per episode at
            ``round(log(ep_tol) / log(gamma))``.
        num_traces: number of episodes consumed from ``traces``.

    Returns:
        A ``defaultdict(float)`` mapping each visited state to its mean
        return, rounded to 2 decimal places.
    """
    # One lazily evaluated stream of return steps per episode.
    episodes : Iterator[Iterable[ReturnStep[S]]] = (
        returns(trace, gamma, ep_tol) for trace in traces
    )

    # The step cap only makes sense for 0 < gamma < 1: log(gamma) is 0 at
    # gamma == 1 (division by zero) and positive above 1 (negative cap, which
    # islice rejects). In those cases read each return stream without a cap
    # (islice treats a stop of None as "no limit").
    # NOTE(review): whether `returns` itself supports gamma >= 1 depends on
    # rl.returns - confirm before relying on it.
    if 0 < gamma < 1:
        max_steps = round(math.log(ep_tol) / math.log(gamma))
    else:
        max_steps = None

    # Running sum of returns and number of visits for every state seen.
    return_sums = defaultdict(float)
    visit_counts = defaultdict(int)
    for episode in itertools.islice(episodes, num_traces):
        for step in itertools.islice(episode, max_steps):
            return_sums[step.state] += step.return_
            visit_counts[step.state] += 1

    # Average the accumulated returns (kept at 2 decimals, as before).
    mc_vf = defaultdict(float)
    for state, total in return_sums.items():
        mc_vf[state] = round(total / visit_counts[state], 2)
    return mc_vf

**Question 2**

In [99]:
#Tabular TD for prediction
def tabular_td_prediction(transitions: Iterable[TransitionStep[S]],
                          gamma : float = 0.9,
                          l : Callable[[int], float] = learning_rate_schedule(0.01, 10000.0, 0.5),
                         total_steps : int = 1000) -> Mapping[S, float]:
    """Estimate a value function with tabular TD(0).

    For each of the first ``total_steps`` transitions the estimate of
    ``state`` is moved towards ``reward + gamma * V(next_state)`` by the
    step size ``l(n)``, where ``n`` is the 1-based index of the update.
    States without an estimate yet are treated as having value 0.0.

    Args:
        transitions: stream of atomic experiences; each item must expose
            ``state``, ``reward`` and ``next_state``.
        gamma: discount factor.
        l: maps the update count (1, 2, 3, ...) to a step size.
        total_steps: number of transitions consumed from ``transitions``.

    Returns:
        A ``defaultdict(float)`` holding an estimate for every state that
        was updated at least once.
    """
    td_vf = defaultdict(float)
    # Count updates from 1: the schedule built by learning_rate_schedule uses
    # (n - 1) in its formula, so n == 0 would give a step size above the
    # initial learning rate (and a division by zero when half_life == 1).
    for n, st in enumerate(itertools.islice(transitions, total_steps), start=1):
        # Read the bootstrap value with .get so that merely looking at a
        # successor (possibly a terminal or never-updated state) does not
        # insert it into the returned value function with a spurious 0.0.
        next_value = td_vf.get(st.next_state, 0.0)
        td_error = st.reward + gamma * next_value - td_vf[st.state]
        td_vf[st.state] += l(n) * td_error

    return td_vf

**Pending: re-implement both functions so that they return an iterator of successive value-function estimates (as the library versions do) instead of a single dictionary.**

**Question 3**

In [102]:
# Compare the library's MC prediction (with a Tabular approximation) against
# the hand-written tabular MC on the same inventory MRP.
gamma = 0.7
num_traces=10000
traces = si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))

mc = mc_prediction(
    traces=traces,
    approx_0=Tabular(),
    γ=gamma,
    episode_length_tolerance=1e-6,
)

# Step through the first num_traces approximations and keep only the last one.
*_, mc_approx = itertools.islice(mc, num_traces)

print("MC with approx")
pprint({
    s : round(mc_approx.evaluate([s])[0], 3)
    for s in si_mrp.non_terminal_states
})

print("\n\nTabular MC")
# The same `traces` object is handed to the hand-written estimator.
# NOTE(review): if `traces` is a one-shot iterator, this call continues with
# the episodes that follow the ones consumed above - confirm that is intended.
tmc_vf = tabular_mc_prediction(
    traces,
    gamma=gamma,
    ep_tol=1e-6,
    num_traces=num_traces,
)
pprint(tmc_vf)

MC with approx
{NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -8.445,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -8.277,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -9.284,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -9.458,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -15.795,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -10.27}


Tabular MC
defaultdict(<class 'float'>,
            {NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -10.26,
             NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -8.44,
             NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -9.45,
             NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -15.78,
             NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -8.26,
             NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -9.27})


In [97]:
# Compare the library's TD prediction (with a Tabular approximation) against
# the hand-written tabular TD. `gamma` is the value set in the MC comparison
# cell above.
num_ep = 1000
total_steps = 100*num_ep
l = learning_rate_schedule(0.01, 1000, 0.5)

episodes = si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))
# NOTE(review): the second argument of unit_experiences_from_episodes appears
# to be a per-episode length cap rather than an episode count - confirm
# against rl.chapter10.prediction_utils.
transitions = unit_experiences_from_episodes(episodes, num_ep)

td_iterator = td_prediction(
    transitions=transitions,
    approx_0=Tabular(),
    γ=gamma,
)
# Step through the first total_steps approximations and keep only the last one.
*_, td_approx = itertools.islice(td_iterator, total_steps)

print("TD with approx")
pprint({
    s : round(td_approx.evaluate([s])[0], 3)
    for s in si_mrp.non_terminal_states
})

print("\n\nTabular TD")
# The hand-written estimator receives the same `transitions` object and the
# explicit learning-rate schedule `l`.
td_vf = tabular_td_prediction(
    transitions=transitions,
    gamma=gamma,
    l=l,
    total_steps=total_steps,
)
pprint(td_vf)

TD with approx
{NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -8.138,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -7.955,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -8.993,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -9.134,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -15.499,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -9.985}


Tabular TD
defaultdict(<class 'float'>,
            {NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -10.393844373699348,
             NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -8.435574373336676,
             NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -15.882757986012924,
             NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -8.44001749159503,
             NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -9.269433595520901,
             NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -9.4994878707

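**We get approximately the same value-function estimates from the hand-written tabular methods and from the library's function-approximation versions (here instantiated with `Tabular()`), for both MC and TD; the small remaining differences are consistent with sampling noise and with the different step sizes used.**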