In [1]:
import math
import random 
import time
import unittest
from src.maze  import MazeAction
from src.mcts  import *
from src.state import AbstractState as State
from src.state import AbstractAction as Action
from src.test  import *

# Mini-Pset: Monte Carlo Tree Search for Collaboration
<img src="img/random-xkcd.png"/>


1. [Part 1: Introduction to MCTS](#part-1)
  1. [Implement: *Selection* (10 points)](#selection)
  2. [Implement: *Expansion* (10 points)](#expansion)
  3. [Implement: *Simulation* (10 points)](#simulation)
  4. [Implement: *Back Propagation* (10 points)](#back-propagation)
  5. [Written Question: *MCTS with Heuristics* (10 points)](#heuristics)
2. [Part 2: MCTS for Collaboration](#part-2)
  1. [Implement: *Reward* (10 points)](#reward)
  2. [Implement: *Is Terminal State* (10 points)](#is-terminal-state)
  3. [Implement: *Possible Actions* (10 points)](#possible-actions)
  4. [Implement: *Take Action* (10 points)](#take-action)
  5. [Written Question: *Limitations of MCTS* (10 points)](#limitations)

---
# Part 1: Introduction to MCTS <a id="part-1"/>

## Structure of this Mini-Pset
TODO -- Finish Description

## *"Monte Carlo"* meets *"Tree Search"*
TODO -- Finish Description
 <table><tr>
    <td> <img src="img/random-pi-estimate.gif" alt="Drawing" style="width: 300px;"/> </td>
    <td> <img src="img/random-tree-search.gif" alt="Drawing" style="width: 300px;"/> </td>
</tr></table>

## AlphaGo Zero
TODO -- Finish Description

 <table><tr>
    <td> <img src="img/random-go-performance.gif" alt="Drawing" style="width: 500px;"/> </td>
    <td> <img src="img/random-lee-sedol.jpg" alt="Drawing" style="width: 400px;"/> </td>
</tr></table>

## MCTS Algorithm 
TODO -- Finish Description
<img src="img/paper-mcts.png" style="width: 700px;" >

### Bandit Problems
TODO -- Finish Description

TODO -- get cartoon bandit

*Bandit Problems* are a class of sequential decision-making problems in which the agent must choose among $K$ actions in order to maximize its cumulative reward.

The best action is not clear because the underlying reward distribution is unknown, so potential rewards must be estimated from past observations. This leads to the exploitation-exploration dilemma. [1] The policy should aim to minimize the player's regret after $n$ plays, which is defined as $R_n$ below:

$R_n = \mu^* n - \sum_{j=1}^{K} \mu_j \, \mathbb{E}[T_j(n)]$

where $\mu^*$ is the best possible expected reward, $\mu_j$ is the expected reward of action $j$, and $\mathbb{E}[T_j(n)]$ is the expected number of executions of action $j$ across $n$ trials. In other words, the regret is the expected loss due to not acting optimally.

### Upper Confidence Bound (UCB)
TODO -- Finish Description
A useful notion for tackling bandit problems is the upper confidence bound (UCB) on the expected reward of each action: at every step the agent plays the action whose bound is currently the highest.

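$UCB1 = \bar{X}_j + \sqrt{\frac{2 \ln n}{n_j}}$, where $\bar{X}_j$ is the average reward obtained from arm $j$, $n_j$ is the number of times arm $j$ has been played, and $n$ is the total number of plays so far. The first term encourages exploitation (it favours arms with a high average reward), and the second term encourages exploration (it favours arms that have been played less often). [TODO find source for this (see Auer et al)]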
 
### Monte Carlo Tree Search 
TODO -- Finish Description
MCTS rests on two fundamental concepts: the value of an action may be approximated by using random simulation, and these values may be used efficiently to adjust the policy towards a best-first strategy.
 
<img src="img/paper-tree-growth.png" style="width: 400px;" >


### Upper Confidence Bound for Trees (UCT)
TODO -- Finish Description
UCT refers to MCTS with any UCB tree selection policy. We will encode UCT with UCB1 in this problem set.
$UCT = \bar{X}_j + 2 C_p \sqrt{\frac{2 \ln n}{n_j}}$, where $n$ is the number of times the parent node has been visited and $n_j$ is the number of times child $j$ has been visited. This yields a powerful form of iterated local search: the exploration term guarantees that even low-value children are chosen eventually. When rewards are in the range $[0, 1]$, the optimal value is $C_p = 0.5$.


### Default Policy: Random Rollout 
TODO -- Finish Description
<img src="img/random-tree-rollout.png" style="width: 400px;" >

## Gomoku Game
TODO -- Finish Description

---
## Implement: *Selection* (10 points) <a id="selection"/>
<img src="img/paper-selection.png" style="width: 700px;" >


TODO -- Finish Description

In [20]:
def select(node: Node, exploration_const: float = 1.0) -> (Action, Node):
    """ Select the child of ``node`` with the highest UCB score; if several
        children share the highest score, one of them is picked uniformly
        at random.

        The score of a child is its average reward
        (``tot_reward / num_samples``) plus
        ``exploration_const * sqrt(2 * ln(node.num_samples) / num_samples)``.
        A child that has never been sampled (``num_samples == 0``) is scored
        as ``+inf``, so it is chosen ahead of every sampled sibling instead
        of causing a division by zero.
    :param node: The parent node; ``node.children`` maps each action to the
                 child node reached by that action
    :param exploration_const: The exploration constant in UCB formula
    :return: The action and the corresponding best child node
    :raises IndexError: If ``node`` has no children (from ``random.choice``)
    """
    ### BEGIN SOLUTION
    best_score = -math.inf
    best_actions = []
    for action, child in node.children.items():
        if child.num_samples == 0:
            # No statistics yet: give the child an unbounded score so it is
            # explored first, rather than dividing by zero below.
            score = math.inf
        else:
            score = (child.tot_reward / child.num_samples
                     + exploration_const *
                     math.sqrt(2.0 * math.log(node.num_samples) /
                               child.num_samples))
        if score > best_score:
            # Strictly better: restart the list of tied candidates.
            best_score = score
            best_actions = [action]
        elif score == best_score:
            best_actions.append(action)
    best_action = random.choice(best_actions)
    return best_action, node.children[best_action]
    ### END SOLUTION

In [21]:
# Hand the select() defined in this notebook to its checker, then call
# test_ok(). Neither helper is defined in this notebook; presumably both come
# from `from src.test import *` -- TODO confirm.
test_select(select)
test_ok()

---
## Implement: *Expansion* (10 points) <a id="expansion"/>
<img src="img/paper-expansion.png" style="width: 700px;" >
TODO -- Finish Description


In [2]:
def expand(node: Node) -> Node:
    """ Grow the tree by one node below ``node``.

        One entry of ``node.unused_edges`` is drawn uniformly at random and
        passed to ``node.add_child``; whatever ``add_child`` returns is
        handed back to the caller.
    :param node: The parent node
    :return: The child node returned by ``node.add_child``
    :raises Exception: If ``node.is_expanded`` is already true
    """
    ### BEGIN SOLUTION
    if not node.is_expanded:
        untried_action = random.choice(node.unused_edges)
        new_child = node.add_child(untried_action)
        return new_child
    raise Exception(
        "Should not expand a node that has already been expanded")
    ### END SOLUTION

In [3]:
# Hand the expand() defined in this notebook to its checker, then call
# test_ok(). Neither helper is defined in this notebook; presumably both come
# from `from src.test import *` -- TODO confirm.
test_expand(expand)
test_ok()

---
## Implement: *Simulation* (10 points) <a id="simulation"/>
<img src="img/paper-simulation.png" style="width: 700px;" >


TODO -- Finish Description


### Default Rollout Policy


TODO -- Finish Description

In [4]:
def simulate(state: State) -> float:
    """ Play out one random rollout from ``state`` (the default policy).

        As long as the current state does not report ``is_terminal``, one of
        its ``possible_actions`` is drawn uniformly at random and applied
        with ``execute_action``; the state returned by that call becomes the
        current state.
    :param state: The starting state
    :return: The ``reward`` of the terminal state that was reached
    """
    ### BEGIN SOLUTION
    current = state
    while True:
        if current.is_terminal:
            return current.reward
        move = random.choice(current.possible_actions)
        current = current.execute_action(move)
    ### END SOLUTION

In [5]:
# Check the simulate() defined in this notebook: the checker must receive
# the function under test, i.e. simulate itself.
test_simulate(simulate)
test_ok()

---
## Implement: *Back Propagation* (10 points) <a id="back-propagation"/>
<img src="img/paper-back-propagation.png" style="width: 700px;" >


TODO -- Finish Description

In [6]:
def backpropagate(node: Node, reward: float = 0.0) -> None:
    """ Record one sample worth ``reward`` on ``node`` and on each of its
        ancestors.

        Starting at ``node`` and following ``parent`` links until a parent is
        ``None`` (the root has no parent), every node on the way gets its
        ``num_samples`` increased by one and its ``tot_reward`` increased by
        ``reward``.  Passing ``None`` as ``node`` does nothing.
    :param node: The node where the reward is evaluated
    :param reward: The reward at the node
    """
    ### BEGIN SOLUTION
    cursor = node
    while True:
        if cursor is None:
            return
        cursor.num_samples += 1
        cursor.tot_reward += reward
        cursor = cursor.parent
    ### END SOLUTION

In [32]:
# Check the backpropagate() defined in this notebook: the checker must
# receive the function under test, i.e. backpropagate itself.
test_backpropagate(backpropagate)
test_ok()

---
## Written Question: *MCTS with Heuristics* (10 points) <a id="heuristics"/>

TODO -- Description

YOUR ANSWER HERE

---
# Part 2: MCTS for Collaboration <a id="part-2"/>
<img src="img/auv-random-walk.png" style="width: 400px;" >


### AUV Value Collection Example


---
## Implement: *Reward* (10 points) <a id="reward"/>

TODO -- Description

In [33]:
def reward(self) -> float:
    """ Total reward collected in this state.

        ``self._environment.rewards`` maps a target position to the reward
        stored for it.  The result is the sum of the rewards of those target
        positions that are contained in ``self.visited``.
    :return: The summed reward as a float; 0.0 if no target position has
             been visited
    """
    ### BEGIN SOLUTION
    collected = 0.0
    for position, value in self._environment.rewards.items():
        if position not in self.visited:
            continue
        collected += value
    return collected
    ### END SOLUTION

In [34]:
# Hand the reward() defined in this notebook to its checker, then call
# test_ok(). Neither helper is defined in this notebook; presumably both come
# from `from src.test import *` -- TODO confirm.
test_reward(reward)
test_ok()

---
## Implement: *Is Terminal State* (10 points) <a id="is-terminal-state"/>

TODO -- Description

In [12]:
def is_terminal(self) -> bool:
    """ Tell whether this state ends the episode.

        A state is terminal if and only if the time has run out.
    :return: True when ``self.time_remains`` is zero or negative,
             False otherwise
    """
    ### BEGIN SOLUTION
    time_left = self.time_remains
    return time_left <= 0
    ### END SOLUTION

In [13]:
# Hand the is_terminal() defined in this notebook to its checker, then call
# test_ok(). Neither helper is defined in this notebook; presumably both come
# from `from src.test import *` -- TODO confirm.
test_is_terminal(is_terminal)
test_ok()

--- 
## Implement: *Possible Actions* (10 points) <a id="possible-actions"/>

TODO -- Description

In [14]:
def possible_actions(self) -> list:
    """ List the moves available to the agent whose turn it is.

        The agent stands at the last position of its own path
        (``self._paths[self._turn][-1]``) and may step one cell along either
        axis.  A step is kept only if ``self.is_in_range`` accepts the target
        position and the position is not in ``self._environment.obstacles``.
    :return: A list of MazeAction objects for the current agent, in the
             order (i + 1, j), (i - 1, j), (i, j + 1), (i, j - 1), with the
             rejected moves left out
    """
    ### BEGIN SOLUTION
    i, j = self._paths[self._turn][-1]
    neighbours = ((i + 1, j), (i - 1, j), (i, j + 1), (i, j - 1))
    # Filter the raw positions first so that a MazeAction is built only for
    # the moves that are actually returned.
    return [MazeAction(self._turn, position) for position in neighbours
            if self.is_in_range(position)
            and position not in self._environment.obstacles]
    ### END SOLUTION

In [15]:
# Hand the possible_actions() defined in this notebook to its checker, then
# call test_ok(). Neither helper is defined in this notebook; presumably both
# come from `from src.test import *` -- TODO confirm.
test_possible_actions(possible_actions)
test_ok()

---
## Implement: *Take Action* (10 points) <a id="take-action"/>

TODO -- Description

In [26]:
def take_action(self, action: MazeAction) -> "MazeState":
    """ Apply ``action`` for the agent whose turn it is.

        ``action.position`` is appended to that agent's path, the turn moves
        on to the next agent (wrapping around after the last one), and once
        the turn wraps back to agent 0 the remaining time drops by one.
        The state is modified in place and the same object is returned.
    :param action: The action
    :return: The updated state (``self``)
    """
    ### BEGIN SOLUTION
    mover = self._turn
    self._paths[mover].append(action.position)
    next_turn = (mover + 1) % len(self._paths)
    self._turn = next_turn
    if next_turn == 0:
        # Every agent has now moved once in this round.
        self._time_remains -= 1
    return self
    ### END SOLUTION

In [19]:
# Hand the take_action() defined in this notebook to its checker, then call
# test_ok(). Neither helper is defined in this notebook; presumably both come
# from `from src.test import *` -- TODO confirm.
test_take_action(take_action)
test_ok()

---
## Written Question: *Limitations of MCTS* (10 points) <a id="limitations"/>

TODO -- Description

YOUR ANSWER HERE

## References 
[1] Browne et al. 2012, *A Survey of Monte Carlo Tree Search Methods*, IEEE Transactions on Computational Intelligence and AI in Games

TODO -- add more references