# Recycling Robot
> Sutton, R. S., & Barto, A. G. (2018). Reinforcement Learning: An Introduction. MIT Press. (Example 3.3)

![](imgs/recycling.png)

In [1]:
from mdp.model import MDP, Transition

In [2]:
# --- Model parameters -------------------------------------------------------
alpha = .8      # probability of staying HIGH after searching in HIGH
beta = .9       # probability of staying LOW after searching in LOW
r_wait = -1     # reward for the wait action
r_search = 1    # reward for the search action (all search transitions but the one below)
r_rescue = -3   # reward for the LOW --search--> HIGH transition

# --- Integer ids, named so the transition list reads without decoding 0/1/2 --
HIGH, LOW = 0, 1
WAIT, SEARCH, RECHARGE = 0, 1, 2

# Each entry is Transition(state, action, s_prime, probability, reward).
conf = [
    Transition(HIGH, WAIT, HIGH, 1, r_wait),
    Transition(HIGH, SEARCH, HIGH, alpha, r_search),
    Transition(HIGH, SEARCH, LOW, 1 - alpha, r_search),
    Transition(LOW, SEARCH, LOW, beta, r_search),
    Transition(LOW, SEARCH, HIGH, 1 - beta, r_rescue),
    Transition(LOW, WAIT, LOW, 1, r_wait),
    Transition(LOW, RECHARGE, HIGH, 1, 0),
]

# 2 states, 3 actions.
# NOTE(review): gamma=1 means no discounting; on this never-terminating task the
# value sums need not converge, so the values reported by the solvers probably
# depend on their iteration limit -- confirm, or use gamma < 1.
mdp = MDP(2, 3, config=conf, gamma=1)
mdp.set_state_names(HIGH=HIGH, LOW=LOW)
mdp.set_action_names(wait=WAIT, search=SEARCH, recharge=RECHARGE)

In [3]:
# Render the MDP as a table with one row per (state, action, s_prime)
# transition, then print it to check the configuration above.
table = mdp.to_table()
print(table)

  state    action s_prime  probability  reward
0  HIGH      wait    HIGH          1.0    -1.0
1  HIGH    search    HIGH          0.8     1.0
2  HIGH    search     LOW          0.2     1.0
3   LOW      wait     LOW          1.0    -1.0
4   LOW    search    HIGH          0.1    -3.0
5   LOW    search     LOW          0.9     1.0
6   LOW  recharge    HIGH          1.0     0.0


### Set a policy

In [4]:
from mdp.policy import Policy
import numpy as np 

In [5]:
# Show the name -> integer id mappings registered on the MDP; the policies
# below return these integer action ids.
print(mdp.action_id)
print(mdp.state_id)

{'wait': 0, 'search': 1, 'recharge': 2}
{'HIGH': 0, 'LOW': 1}


In [6]:
class SearchPolicy(Policy):
    """Policy that returns action id 1 (``search`` on this notebook's MDP) for every state."""
    def __getitem__(self, state: int) -> int:
        """Return the action id for ``state``; the state is ignored and the result is always 1."""
        return 1

class SearchWaitPolicy(Policy):
    """Policy that searches in state 0 (``HIGH``) and waits in any other state."""
    def __getitem__(self, state: int) -> int:
        """Return action id 1 (``search``) for state 0, otherwise action id 0 (``wait``)."""
        return 1 if state == 0 else 0


In [7]:
# Build one instance of each policy class, both constructed from the same MDP.
search = SearchPolicy(mdp)
sw_policy = SearchWaitPolicy(mdp)

#### Policy Evaluation

$$
V_t(s) = Q_{t-1}(s, \pi_s)
$$
$$
Q_{t-1}(s, \pi_s) = \sum\limits_{s'} T(s, \pi_s, s')\left[ reward(s, \pi_s, s') + \gamma  V_{t-1}(s')\right]
$$

In [8]:
from mdp.algorithms import policy_evaluation

In [9]:
# Evaluate each fixed policy on the MDP and print the returned value array
# (presumably one entry per state, ordered by state id -- confirm against
# policy_evaluation).
print(policy_evaluation(policy=search, mdp=mdp))
print(policy_evaluation(policy=sw_policy, mdp=mdp))

[734.22222222 732.88888889]
[ -990. -1000.]


#### Value Iteration
$$
    V^{*}_{t}(s) = \max\limits_a Q^{*}_{t-1}(s, a)
$$
$$
    Q^{*}_{t-1}(s, a) = \sum\limits_{s'} T(s, a, s')\left[reward(s, a, s') + \gamma  V^{*}_{t-1}(s')\right]
$$

In [10]:
from mdp.algorithms import value_iteration

In [11]:
# Run value iteration on the MDP and unpack its four return values.
# V_star and Pi_star are printed in the next cell; value_history and
# policy_history are presumably per-iteration records -- confirm against
# value_iteration.
V_star, Pi_star, value_history, policy_history = value_iteration(mdp=mdp)

In [12]:
# Print the value array, then one "<state name> <action name>" line per
# (state id, action id) pair in the policy mapping.
print(V_star)
for s, a in Pi_star.items():
    print(mdp.states[s], mdp.actions[a])

[833.61222222 832.77888889]
HIGH search
LOW recharge


#### Policy Iteration

When we have a policy $\pi$, for all states $s$, we can find a greedy policy $\pi'$ by:
$$
\pi'(s) = \arg\max\limits_{a} \sum\limits_{s'}p(s'\mid s, a) \left [ r(s, a, s') + \gamma V_{\pi}(s')\right ]
$$
**Note**: suppose that $\pi'$ is not better than the old policy $\pi$. Then $V_{\pi} = V_{\pi'}$. Thus we can rewrite:
$$
V_{\pi'}(s) = \max\limits_{a} \sum\limits_{s'}p(s'\mid s, a) \left [ r(s, a, s') + \gamma V_{\pi'}(s')\right ]
$$
Since the equation above is the **Bellman optimality equation**, $V_{\pi'}$ must be $V^*$, and both $\pi$ and $\pi'$ must be optimal policies. Thus policy improvement **always gives us a strictly better policy except when the policy is already optimal**.

In [13]:
from mdp.algorithms import policy_iteration

In [14]:
# Run policy iteration on the MDP; it returns the resulting policy and a
# history list, both inspected in the cells below.
pi, history = policy_iteration(mdp)

In [15]:
# For every state id, print the state's name followed by the name of the
# action the policy assigns to it.
for s in mdp.states:
    print(mdp.states[s], mdp.actions[pi[s]])

HIGH search
LOW recharge


In [16]:
# Display the history returned by policy_iteration: a list of
# (policy mapping, value array) pairs.
history

[({0: 0, 1: 1}, array([-1000.,  -984.])),
 ({0: 1, 1: 1}, array([734.22222222, 732.88888889])),
 ({0: 1, 1: 2}, array([833.47222222, 832.63888889])),
 ({0: 1, 1: 2}, array([833.47222222, 832.63888889]))]