In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
%%html
<style>
  table {margin-left: 0 !important;}
</style>

# Recycling robot
The story of a robot that collects empty cans in an office...
- **States**: `H` (high energy), `L` (low energy)
- **Actions**:
    - Actions(`H`) = `search`, `wait` 
    - Actions(`L`) = `search`, `wait`, `recharge` 
    
**Transitions and rewards**

| s  |    a     | s' | P(s' \| s, a) | r(s, a, s') |
| :--: | :--------: | :--: | :---------------: | :-----------: |
| `H`  |  `search`  | `H`  |     $\alpha$      | $r_{search}$  |
| `H`  |  `search`  | `L`  |    $1-\alpha$     | $r_{search}$  |
| `L`  |  `search`  | `H`  |     $1-\beta$     |      -3       |
| `L`  |  `search`  | `L`  |      $\beta$      | $r_{search}$  |
| `H`  |   `wait`   | `H`  |         1         |  $r_{wait}$   |
| `H`  |   `wait`   | `L`  |         0         |  $r_{wait}$   |
| `L`  |   `wait`   | `H`  |         0         |  $r_{wait}$   |
| `L`  |   `wait`   | `L`  |         1         |  $r_{wait}$   |
| `L`  | `recharge` | `H`  |         1         |       0       |
| `L`  | `recharge` | `L`  |         0         |       0       |


> Sutton, R. S., & Barto, A. G. (2018). Reinforcement learning: An introduction. MIT Press. p. 52

## Init Recycling Robot with some parameters

In [3]:
from utils import transitions_table, mdp_to_graph, plot_mdp
from recycling_robot import RecyclingMDP

In [4]:
# Model parameters (see the transition table above):
#   alpha    - probability of staying in `H` after `search`
#   beta     - probability of staying in `L` after `search`
#   r_search - reward collected by `search`
#   r_wait   - reward collected by `wait`
alpha = .8
beta = .9
r_search = 1
r_wait = -1

mdp = RecyclingMDP(
    alpha=alpha,
    beta=beta,
    r_search=r_search,
    r_wait=r_wait,
    gamma=.99,  # discount factor applied to future rewards
)

In [5]:
# Tabulate the MDP's transitions; the table displayed in the next cell has one
# row per (from_state, action, to_state) with its reward and probability.
T = transitions_table(mdp)

In [6]:
T

Unnamed: 0,from_state,action,to_state,reward,probability
0,H,search,H,1,0.8
1,H,search,L,1,0.2
2,H,wait,H,-1,1.0
3,H,wait,L,-1,0.0
4,L,search,H,-3,0.1
5,L,search,L,1,0.9
6,L,recharge,H,0,1.0
7,L,recharge,L,0,0.0
8,L,wait,H,-1,0.0
9,L,wait,L,-1,1.0


In [7]:
# Convert the MDP to a graph and plot it.
# NOTE(review): `show` presumably renders the plot to the given HTML file in the
# working directory — confirm against utils.plot_mdp.
net = plot_mdp(mdp_to_graph(mdp))
net.show('recycling-robot.html')

## Policy
Let's define a policy to try: the robot always performs `search` when it is in state `H`, and performs `wait` when it is in state `L`.

In [8]:
from models import Policy

In [9]:
class TestPolicy(Policy):
    """Hand-written policy: `search` in state `H`, `wait` in every other state."""

    def __init__(self, mdp):
        """Attach the policy to ``mdp`` by delegating to the base ``Policy``."""
        super().__init__(mdp=mdp)

    def action(self, state):
        """Return the action this policy takes in ``state``."""
        return 'search' if state == 'H' else 'wait'

In [10]:
pi = TestPolicy(mdp=mdp)

In [11]:
# Sample ten episodes of at most ten steps each under the current policy.
episodes = [pi.episode(max_len=10) for _ in range(10)]

# Print each episode on one line: the start state, then
# "<first letter of action> => <next state>" for every step, then its utility.
for episode in episodes:
    start_state = episode[0][0]
    trace = " ".join(
        "{} => {}".format(action[0], next_state)
        for _state, action, next_state, _reward in episode
    )
    utility_label = " U({})".format(pi.utility(episode))
    print(start_state, trace, utility_label)

H s => L w => L w => L w => L w => L w => L w => L w => L w => L w => L  U(-7.5617924991195515)
H s => H s => H s => H s => L w => L w => L w => L w => L w => L w => L  U(-1.6809944991195498)
H s => H s => H s => H s => L w => L w => L w => L w => L w => L w => L  U(-1.6809944991195498)
H s => H s => H s => L w => L w => L w => L w => L w => L w => L w => L  U(-3.6215924991195507)
H s => L w => L w => L w => L w => L w => L w => L w => L w => L w => L  U(-7.5617924991195515)
H s => H s => H s => H s => H s => H s => L w => L w => L w => L w => L  U(2.142177620680448)
H s => H s => H s => H s => H s => H s => L w => L w => L w => L w => L  U(2.142177620680448)
H s => H s => H s => L w => L w => L w => L w => L w => L w => L w => L  U(-3.6215924991195507)
H s => H s => H s => H s => H s => H s => L w => L w => L w => L w => L  U(2.142177620680448)
H s => H s => L w => L w => L w => L w => L w => L w => L w => L w => L  U(-5.581792499119551)


## Policy evaluation

In [12]:
from algorithms import policy_evaluation

In [13]:
# Evaluate the hand-written policy `pi` on the MDP; the result (displayed in the
# next cell) maps each state to its value under that policy.
# NOTE(review): `epsilon` is presumably the convergence tolerance — confirm in algorithms.
evaluation = policy_evaluation(policy=pi, mdp=mdp, epsilon=1e-10)

In [14]:
evaluation

{'H': -90.3802982598742, 'L': -99.99568287525882}

## Value iteration

In [15]:
from IPython.display import display, clear_output
from algorithms import value_iteration
from utils import show_value_iterations

In [16]:
# Run value iteration on the MDP. It returns four values: the final state values,
# the final policy, and two histories that are passed to
# `show_value_iterations` below.
# NOTE(review): `epsilon` is presumably the convergence tolerance — confirm in algorithms.
optimal_value, optimal_policy, value_history, policy_history = value_iteration(mdp=mdp, epsilon=1e-10)

In [17]:
optimal_value, optimal_policy

({'H': 83.46886263527185, 'L': 82.63413809437034},
 {'H': 'search', 'L': 'recharge'})

In [18]:
show_value_iterations(value_history, policy_history)

Unnamed: 0,S,V,A
0,H,21.936115,search
1,L,21.101391,recharge


quit


## Policy iteration

In [19]:
from algorithms import policy_iteration
from utils import show_policy_iterations

In [20]:
# Run policy iteration on the MDP (called with its defaults; no epsilon is passed here).
# NOTE(review): this rebinds `pi`, which until now referred to the TestPolicy
# instance. Re-running the earlier episode / policy-evaluation cells after this
# one will silently use the policy returned here instead — consider a distinct name.
pi, pi_history = policy_iteration(mdp)

In [21]:
pi.actions

{'H': 'search', 'L': 'recharge'}

In [22]:
show_policy_iterations(pi_history)

Unnamed: 0,S,V,A
0,H,83.468856,search
1,L,82.634132,recharge



