In [1]:
import numpy as np
import ipywidgets as widgets
from IPython.display import Markdown

In [2]:
class MarsRoverEnv:
    def __init__(self, num_states=5, starting_position=2, rewards=[1, 0, 0, 0, 10]):
        self.num_states = num_states
        self.starting_position = starting_position
        self.position = starting_position
        self.rewards = rewards
        self.c_steps = 0
    
    def reset(self):
        self.c_steps = 0
        self.position = self.starting_position
        return self.position
    
    def step(self, action):
        done = False
        self.c_steps += 1
        if action == 0:
            if self.position > 0:
                self.position -= 1
        elif action == 1:
            if self.position < self.num_states-1:
                self.position += 1
        else:
            print("Not a valid action")
            return
        reward = self.rewards[self.position]
        return self.position, reward, self.c_steps > 10

In [3]:
def policy_iteration_step(qs, pi, state, new_state, action, reward, gamma=0.9):
    done = False
    qs[state][action] = reward + gamma * (qs[new_state][0] + qs[new_state][1])
    new_pi = [np.random.choice(np.where(qs[state] == qs[state].max())[0]) for state in np.arange(len(qs))]
    if np.linalg.norm((np.array(pi) - np.array(new_pi)), ord=1) <= 0:
        done = True
    return qs, new_pi, done

In [4]:
def value_iteration_step(v, state, new_state, action, reward, gamma=0.9):
    done = False
    new_v = np.copy(v)
    new_v[state] = reward + gamma * v[new_state]
    if np.linalg.norm((v - new_v), ord=1) <= 0:
        done = True
    return new_v, done

In [5]:
# Change environment setup here
current_position = 2
transition_probabilities = np.ones((5,2))*0.5
positions = ["", "", "", "", ""]
positions[current_position] = "You are here"

In [6]:
direction = widgets.ToggleButtons(
    options=['Left', 'Right'],
    description='Direction:',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltips=['Rover moving one step right', 'Rover moving one step left'],
)

In [7]:
#TODO: use the picture here?
table = Markdown("""
# Which way should the rover move?
| Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|-----------|---------------------|:-------------------:|---------------------|------------------------|
|{positions[0]}|{positions[1]}|{positions[2]}|{positions[3]}|{positions[4]}|
""".format(positions=positions))

In [8]:
def adjust_position(current_position, direction, prob=(0.5,0.5)):
    follow_policy = np.random.choice([0, 1], p=[1-prob[0], prob[1]])
    if direction.value == "Left" and current_position > 0 and follow_policy:
        current_position -= 1
    elif direction.value == "Left" and current_position < 4:
        current_position += 1
    elif direction.value == "Right" and current_position < 4 and follow_policy:
        current_position += 1
    elif direction.value == "Right" and current_position > 0:
        current_position -= 1
    return current_position

In [9]:
positions = ["", "", "", "", ""]
positions[current_position] = "You are here"
display(table)


# Which way should the rover move?
| Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|-----------|---------------------|:-------------------:|---------------------|------------------------|
|||You are here|||


In [10]:
display(Markdown("""## Select a direction"""))
display(direction)
display(Markdown("""## The rover will follow your instructions with a probability of 50%"""))

## Select a direction

ToggleButtons(description='Direction:', options=('Left', 'Right'), tooltips=('Rover moving one step right', 'R…

## The rover will follow your instructions with a probability of 50%

In [11]:
current_position = adjust_position(current_position, direction, transition_probabilities[current_position])
positions = ["", "", "", "", ""]
positions[current_position] = "You are here"
table = Markdown("""
# Which way should the rover move?
| Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|-----------|---------------------|:-------------------:|---------------------|------------------------|
|{positions[0]}|{positions[1]}|{positions[2]}|{positions[3]}|{positions[4]}|
""".format(positions=positions))
display(table)


# Which way should the rover move?
| Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|-----------|---------------------|:-------------------:|---------------------|------------------------|
||||You are here||


# Interacting with the rover environment

In [12]:
rover = MarsRoverEnv()
state = rover.reset()
current_position = rover.position

In [13]:
positions = ["", "", "", "", ""]
positions[current_position] = "You are here"
table = Markdown("""
## Which way should the rover move?
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   |{positions[0]}|{positions[1]}|  {positions[2]}         |  {positions[3]}     |  {positions[4]}|
""".format(positions=positions))
display(table)


## Which way should the rover move?
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   |||  You are here         |       |  |


In [14]:
display(direction)

ToggleButtons(description='Direction:', options=('Left', 'Right'), tooltips=('Rover moving one step right', 'R…

In [15]:
current_position = rover.position
if direction.value == "Left":
    action = 0
else:
    action = 1
    
new_state, reward, done = rover.step(action)
current_position = adjust_position(current_position, direction, transition_probabilities[current_position])
positions = ["", "", "", "", ""]
positions[current_position] = "You are here"
table = Markdown("""
## Which way should the rover move?
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   |{positions[0]}|{positions[1]}    |  {positions[2]}     |  {positions[3]}     |  {positions[4]}|   
""".format(positions=positions))
display(table)
display(Markdown("""
### State : {state}
### Action : {action}
### New State : {new_state}
### Reward:  {reward}""".format(action=action, reward=reward, state=state, new_state=new_state)))
state = new_state


## Which way should the rover move?
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   ||    |       |  You are here     |  |   



### State : 2
### Action : 0
### New State : 1
### Reward:  0

# Policy Iteration

In [16]:
qs = np.zeros((5, 2))
pi = np.random.randint(0, 2, size=5)
rs = [1, -1, -1, -1, 10]
rover = MarsRoverEnv(rewards=rs)
state = rover.reset()
current_position = rover.position
transition_probabilities = np.ones((5,2))*0.5

In [17]:
Markdown("""
### Current policy $\pi$ is to play:
### {pi[0]} in state 0
### {pi[1]} in state 1
### {pi[2]} in state 2
### {pi[3]} in state 3
### {pi[4]} in state 4""".format(pi=pi))


### Current policy $\pi$ is to play:
### 0 in state 0
### 1 in state 1
### 0 in state 2
### 0 in state 3
### 0 in state 4

In [18]:
Markdown("""
### The rover is in state {state}, so our policy dictates action {a}""".format(state=state, a=pi[state]))


### The rover is in state 2, so our policy dictates action 0

In [19]:
action = pi[state]
follow_policy = np.random.choice([0, 1], p=[1-transition_probabilities[state][action], transition_probabilities[state][action]])
if not follow_policy:
    action = 1 - action
new_state, reward, done = rover.step(action)
if action == 0:
    current_position = max(0, current_position-1)
else:
    current_position = min(4, current_position+1)
positions = ["", "", "", "", ""]
positions[current_position] = "You are here"
table = Markdown("""
## Which way should the rover move?
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   |{positions[0]}|{positions[1]}    |  {positions[2]}     |  {positions[3]}     |  {positions[4]}|   
| Q(s, a=0)  |   {qs[0][0]} |      {qs[1][0]}     |     {qs[2][0]}      |     {qs[3][0]}      | {qs[4][0]}|
| Q(s, a=1)  |   {qs[0][1]} |      {qs[1][1]}     |     {qs[2][1]}      |     {qs[3][1]}      | {qs[4][1]}|
""".format(positions=positions, qs=qs))
display(table)
display(Markdown("""
### State : {state}
### Action : {action}
### New State : {new_state}
### Reward:  {reward}""".format(action=action, reward=reward, state=state, new_state=new_state)))


## Which way should the rover move?
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   ||You are here    |       |       |  |   
| Q(s, a=0)  |   0.0 |      0.0     |     0.0      |     0.0      | 0.0|
| Q(s, a=1)  |   0.0 |      0.0     |     0.0      |     0.0      | 0.0|



### State : 2
### Action : 0
### New State : 1
### Reward:  -1

In [20]:
qs, pi, done = policy_iteration_step(qs, pi, state, new_state, action, reward)
state = new_state
table = Markdown("""
## Updated values
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   |{positions[0]}|{positions[1]}    |  {positions[2]}     |  {positions[3]}     |  {positions[4]}|   
| Q(s, a=0)  |   {qs[0][0]} |      {qs[1][0]}     |     {qs[2][0]}      |     {qs[3][0]}      | {qs[4][0]}|
| Q(s, a=1)  |   {qs[0][1]} |      {qs[1][1]}     |     {qs[2][1]}      |     {qs[3][1]}      | {qs[4][1]}|
""".format(positions=positions, qs=qs))
display(table)


## Updated values
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   ||You are here    |       |       |  |   
| Q(s, a=0)  |   0.0 |      0.0     |     -1.0      |     0.0      | 0.0|
| Q(s, a=1)  |   0.0 |      0.0     |     0.0      |     0.0      | 0.0|


In [21]:
Markdown("""
### Policy updates done? {done}""".format(done=done))


### Policy updates done? False

# Value iteration

In [22]:
v = np.zeros(5)
rs = [1, -1, -1, -1, 10]
rover = MarsRoverEnv(rewards=rs)
state = rover.reset()
current_position = rover.position
transition_probabilities = np.ones((5,2))*0.5

In [23]:
r1=(rs[state-1]+0.9*v[state-1])
r2=(rs[state+1]+0.9*v[state+1])
a = np.argmax([r1, r2])
if r1 == r2:
    a = np.random.randint(2)
Markdown("""
### Compute Action for current state {state}:
### a with max (R(2,a) + $\gamma$ $\sum$ V(s'))
### for all s' that can follow {state}, which are {follow1} and {follow2}.
### a=0: R(2, 0) + 0.9 * {v1} = {r1}
### a=1: R(2, 1) + 0.9 * {v2} = {r2}
### So we choose {a}.""".format(state=state, v1=v[state-1], v2=v[state+1], follow1=state-1, follow2=state+1, r1=r1, r2=r2, a=a))


### Compute Action for current state 2:
### a with max (R(2,a) + $\gamma$ $\sum$ V(s'))
### for all s' that can follow 2, which are 1 and 3.
### a=0: R(2, 0) + 0.9 * 0.0 = -1.0
### a=1: R(2, 1) + 0.9 * 0.0 = -1.0
### So we choose 1.

In [24]:
action = a
follow_policy = np.random.choice([0, 1], p=[1-transition_probabilities[state][action], transition_probabilities[state][action]])
if not follow_policy:
    action = 1 - action
new_state, reward, done = rover.step(action)
if action == 0:
    current_position = max(0, current_position-1)
else:
    current_position = min(4, current_position+1)
positions = ["", "", "", "", ""]
positions[current_position] = "You are here"
table = Markdown("""
## Which way should the rover move?
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   |{positions[0]}|{positions[1]}    |  {positions[2]}     |  {positions[3]}     |  {positions[4]}|   
| State values  |   {v[0]}  |      {v[1]}         |         {v[2]}      |     {v[3]}          | {v[4]}|
""".format(positions=positions, v=v))
display(table)
display(Markdown("""
### State : {state}
### Action : {action}
### New State : {new_state}
### Reward:  {reward}""".format(action=action, reward=reward, state=state, new_state=new_state)))
state = new_state


## Which way should the rover move?
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   ||You are here    |       |       |  |   
| State values  |   0.0  |      0.0         |         0.0      |     0.0          | 0.0|



### State : 2
### Action : 0
### New State : 1
### Reward:  -1

In [25]:
old_v = np.copy(v)
v, done = value_iteration_step(v, state, new_state, action, reward)
table = Markdown("""
## Updated values
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   |{positions[0]}|{positions[1]}    |  {positions[2]}     |  {positions[3]}     |  {positions[4]}|   
| State values  |   {v[0]}  |      {v[1]}         |         {v[2]}      |     {v[3]}          | {v[4]}|
""".format(positions=positions, v=v))
display(table)


## Updated values
|               | Good view | Nothing interesting | Nothing interesting | Nothing interesting | Very important science |
|:-------------:|:---------:|:-------------------:|:-------------------:|:-------------------:|:----------------------:|
|    Position   ||You are here    |       |       |  |   
| State values  |   0.0  |      -1.0         |         0.0      |     0.0          | 0.0|


In [26]:
Markdown("""
### Value updates done? {done}""".format(done=done))


### Value updates done? False