**Question 1**

We know that

$$v_0 = [10,1,0]$$
$$q_k(s,a) = R(s,a) + \sum_{a \in \{a_1,a_2\}} P(s,a,s').v_{k-1}(s') $$ 

Using the above,

$$v_1(s_1) = \max(q_1(s_1,a_1),q_1(s_1,a_2))$$
$$v_1(s_1) = \max(10.6,11.2)=11.2$$
$$\pi_1(s_1) = a_2$$

$$v_1(s_2) = \max(q_1(s_2,a_1),q_1(s_2,a_2))$$
$$v_1(s_2) = \max(4.3,4.3)=4.3$$
$$\pi_1(s_2) = a_1$$

$$v_2(s_1) = \max(q_2(s_1,a_1),q_2(s_1,a_2))$$
$$v_2(s_1) = \max(12.82,11.98)= 12.82$$
$$\pi_2(s_1) = a_1$$

$$v_2(s_2) = \max(q_2(s_2,a_1),q_(s_2,a_2))$$
$$v_2(s_2) = \max(5.65,5.89)=5.89$$
$$\pi_2(s_2) = a_2$$

For $k>2$,$v_{k-1}(s_1)\geq 12.82$ and $v_{k-1}(s_2)\geq 5.89$
$$q_k(s_1,a_1) -  q_k(s_1,a_2) = -2 + 0.1* v_{k-1}(s_1)+0.4*v_{k-1}(s_2) > 0$$

$$q_k(s_2,a_2) -  q_k(s_2,a_1) = -2 + 0.2* v_{k-1}(s_1) > 0$$

$\therefore \pi_k(s_1)=a_1$ and $\pi_k(s_2)=a_2$ for $k>2$. 

**Question 4**

In [4]:
import sys
sys.path.append("../")
from ast import Invert
from dataclasses import dataclass
from email.errors import InvalidMultipartContentTransferEncodingDefect
from typing import Tuple, Dict, Mapping
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.policy import FiniteDeterministicPolicy
from rl.markov_process import FiniteMarkovProcess, FiniteMarkovRewardProcess
from rl.distribution import Categorical
from scipy.stats import poisson
import pdb


@dataclass(frozen=True)
class InventoryState:
    on_hand_1: int
    on_order_1: int
    on_hand_2: int
    on_order_2: int

    def inventory_position_for_1(self) -> int:
        return self.on_hand_1 + self.on_order_1

    def inventory_position_for_2(self) -> int:
        return self.on_hand_2 + self.on_order_2


InvOrderMapping = Mapping[
    InventoryState,
    Mapping[Tuple[int, int, int], Categorical[Tuple[InventoryState, float]]]
]


class TwoInventory(FiniteMarkovDecisionProcess[InventoryState, Tuple[int, int, int]]):

    def __init__(self,
                 c1: int,
                 c2: int,
                 h1: int,
                 h2: int,
                 lambda1: int,
                 lambda2: int,
                 K1: int,
                 K2: int,
                 p1: int,
                 p2: int
                 ):
        self.c1: int = c1
        self.c2: int = c2
        self.h1: int = h1
        self.h2: int = h2
        self.lambda1: int = lambda1
        self.lambda2: int = lambda2
        self.k1: int = K1
        self.k2: int = K2
        self.poisson1 = poisson(self.lambda1)
        self.poisson2 = poisson(self.lambda2)
        self.p1 = p1
        self.p2 = p2
        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> InvOrderMapping:
        d: Dict[InventoryState, Dict[Tuple[int, int, int],
                                     Categorical[Tuple[InventoryState, float]]]] = {}

        def isOrder(o1, o2): return 1 if (o1+o2) > 0 else 0
        def isTransfer(t): return 1 if abs(t) > 0 else 0
        for alpha1 in range(0, self.c1+1):
            for beta1 in range(self.c1+1-alpha1):
                for alpha2 in range(0, self.c2+1):
                    for beta2 in range(self.c2+1-alpha2):
                        state: InventoryState = InventoryState(
                            alpha1, beta1, alpha2, beta2)
                        ip1: int = state.inventory_position_for_1()
                        ip2: int = state.inventory_position_for_2()
                        d1: Dict[Tuple[int, int, int],
                                 Categorical[Tuple[InventoryState, float]]] = {}
                        for x in range(-1*min(alpha2, self.c1-ip1), min(alpha1, self.c2-ip2)+1):
                            for order1 in range(self.c1-ip1+x+1):
                                for order2 in range(self.c2 - ip2 - x + 1):
                                    a = (x, order1, order2)
                                    base_reward = -self.h1*alpha1 + -self.h2*alpha2 - \
                                        self.k1 * isOrder(order1, order2) - \
                                        self.k2*isTransfer(x)
                                    probs_dict: Dict[Tuple[InventoryState, float], float] = \
                                        {
                                            (InventoryState(ip1-x-i, order1, ip2+x-j, order2), base_reward):
                                            self.poisson1.pmf(
                                                i)*self.poisson2.pmf(j)
                                            for i in range(ip1-x) for j in range(ip2+x)
                                    }
                                    prob1 = 1 - self.poisson1.cdf(ip1-x-1)
                                    probs_dict.update({
                                        (InventoryState(0, order1, ip2+x-j, order2),
                                         base_reward - self.p1*(prob1*(self.lambda1-ip1+x) + (ip1-x)*self.poisson1.pmf(ip1-x))):
                                        prob1*self.poisson2.pmf(j) for j in range(ip2+x)
                                    })

                                    prob2 = 1 - self.poisson2.cdf(ip2+x-1)
                                    probs_dict.update({
                                        (InventoryState(ip1-x-i, order1, 0, order2),
                                         base_reward - self.p2*(prob2*(self.lambda2-ip2-x) + (ip2+x)*self.poisson2.pmf(ip2+x))):
                                        prob2*self.poisson1.pmf(i) for i in range(ip1-x)
                                    })
                                    probs_dict.update({
                                        (InventoryState(0, order1, 0, order2),
                                         base_reward - self.p1 *
                                         (prob1*(self.lambda1-ip1+x) +
                                          (ip1-x)*self.poisson1.pmf(ip1-x))
                                         - self.p2*(prob2*(self.lambda2-ip2-x) + (ip2+x)*self.poisson2.pmf(ip2+x))):
                                        prob1*prob2
                                    })
                                    # pdb.set_trace()
                                    d1[a] = Categorical(probs_dict)

                        d[state] = d1
        return d


if __name__ == "__main__":

    c1 = 2
    c2 = 2
    h1 = 0
    h2 = 0
    lambda1 = 1
    labmda2 = 1
    K1 = 2
    K2 = 1
    p1 = 1
    p2 = 2

    mdp = TwoInventory(
        c1=c1,
        c2=c2,
        h1=h1,
        h2=h2,
        lambda1=lambda1,
        lambda2=lambda1,
        K1=K1,
        K2=K2,
        p1=p1,
        p2=p2
    )

    from rl.dynamic_programming import evaluate_mrp_result
    from rl.dynamic_programming import policy_iteration_result
    from rl.dynamic_programming import value_iteration_result

    print("MDP Value Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_vi, opt_policy_vi = value_iteration_result(mdp, gamma=0.5)
    print(opt_vf_vi)
    print(opt_policy_vi)
    print()



MDP Value Iteration Optimal Value Function and Optimal Policy
--------------
{NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=0)): -5.871132556792174, NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=1)): -4.069206256598397, NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=2)): -3.1602499675952154, NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=1, on_order_2=0)): -4.069206256598397, NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=1, on_order_2=1)): -3.1602499675952154, NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=2, on_order_2=0)): -3.1602499675952154, NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=1, on_hand_2=0, on_order_2=0)): -5.009646394937202, NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=1, on_hand_2=0, on_order_2=1)): -3.138612545761457, NonTerminal(state=InventoryState(on_hand_1=0, o