### **Creating MCTSAgent that works for multiple _highway-env_ environments.**

In this milestone we develop an agent that stores a trained MCTS tree for each of several _highway-env_ environment tasks, with each tree node holding its approximated UCT bound values once training has completed.

In [24]:
# Package download statement for highway-env only.
# We are not adding visualization import for this deliverable.
!pip install git+https://github.com/eleurent/highway-env

Collecting git+https://github.com/eleurent/highway-env
  Cloning https://github.com/eleurent/highway-env to /tmp/pip-req-build-9qujnive
  Running command git clone -q https://github.com/eleurent/highway-env /tmp/pip-req-build-9qujnive
Building wheels for collected packages: highway-env
  Building wheel for highway-env (setup.py) ... [?25l[?25hdone
  Created wheel for highway-env: filename=highway_env-1.0.dev0-cp36-none-any.whl size=80900 sha256=2540c73fff3719702eea202162d33b34ab547ec4fa346c148ec398bf08e9d676
  Stored in directory: /tmp/pip-ephem-wheel-cache-61_hx32v/wheels/e6/10/d8/02a077ca221bbac1c6fc12c1370c2f773a8cd602d4be3df0cc
Successfully built highway-env


In [25]:
# Imports required for MCTS base code execution.
import sys
import random
import itertools
from time import time
from copy import copy, deepcopy
from math import sqrt, log
# Imports required for environment build and interactions.
import gym
import highway_env

In [26]:
# Defining node class and associated required properties with it.
class Node:
    """A single vertex of the MCTS search tree.

    Attributes:
        parent: the Node this one hangs off, or None for a root.
        action: the action taken from ``parent`` to reach this node
            (None for a root).
        children: child nodes; empty until the node is expanded.
        explored_children: how many entries of ``children`` have already been
            selected at least once (used as an index into ``children``).
        visits: number of episodes back-propagated through this node.
        value: sum of the episode rewards back-propagated through this node.
    """

    def __init__(self, parent, action):
        # Position in the tree.
        self.parent, self.action = parent, action
        # Running statistics, all starting from zero.
        self.visits = self.value = 0
        self.explored_children = 0
        # Every node gets its own fresh child list.
        self.children = []

In [27]:
# HighwayEnvAgent class that stores the root node of the trained MCTS tree
# for each supported environment (one root attribute per environment).
class HighwayEnvAgent:
    """Container holding one MCTS root node per supported environment.

    Each ``*_root`` attribute starts out as an empty root
    (``Node(None, None)``) and is meant to be replaced or grown by a search.
    """

    def __init__(self):
        # One independent, empty root per environment task.
        for root_name in (
            "roundabout_root",
            "merge_root",
            "intersection_root",
            "lane_change_root",
        ):
            setattr(self, root_name, Node(None, None))

In [41]:
# Declaring instance of HighwayEnvAgent Agent class.
# This module-level object is the shared store used by Executer.execute():
# with eval_flag=False the search root is written into it, and with
# eval_flag=True the previously stored root is read back from it.
mcts_agent = HighwayEnvAgent()

In [36]:
# Basic utility functions for MCTS algorithm implementation.

# This function enumerates the complete, exhaustive list of actions in an
# action space (one child node is later created per action).
def node_expansion(space):
    """Enumerate every action contained in a gym action space.

    Args:
        space: a ``gym.spaces.Discrete`` space, or a ``gym.spaces.Tuple``
            whose components are themselves supported spaces.

    Returns:
        ``range(space.n)`` for a Discrete space; for a Tuple space, an
        ``itertools.product`` iterator over the enumerations of its
        sub-spaces.

    Raises:
        NotImplementedError: for any other kind of space.
    """
    if isinstance(space, gym.spaces.Discrete):
        return range(space.n)
    if isinstance(space, gym.spaces.Tuple):
        # Recurse into each component, then combine them into joint actions.
        per_component = [node_expansion(sub_space) for sub_space in space.spaces]
        return itertools.product(*per_component)
    raise NotImplementedError

# Upper Confidence Bound U(s,a) calculation formula.
def upper_conf_bound(node, exploration=1.0):
    """Return the upper confidence bound U(s, a) of a child node.

    The score is the node's mean reward (``value / visits``) plus an
    exploration bonus ``exploration * sqrt(log(parent.visits) / visits)``.

    Args:
        node: object with ``value``, ``visits`` and ``parent`` attributes;
            ``parent`` must expose ``visits``.
        exploration: weight of the exploration bonus. The default of 1.0
            reproduces the original, unweighted formula.

    Returns:
        The bound as a float, or ``float("inf")`` for a node that has never
        been visited, so that it is always preferred by ``max``.
    """
    if node.visits == 0:
        # Avoid dividing by zero: an untried node gets top priority.
        return float("inf")
    mean_reward = node.value / node.visits
    bonus = sqrt(log(node.parent.visits) / node.visits)
    return mean_reward + exploration * bonus

# Calculates the averaged out value of the total reward gains for better
# policy estimation during evaluation of search space by MCTS algorithm.
def moving_averages(v, n):
    """Return the moving averages of ``v`` over a sliding window.

    Args:
        v: sequence of numbers (must support ``len`` and slicing).
        n: requested window size; it is clamped to ``len(v)`` when the
            sequence is shorter than the window.

    Returns:
        A list of ``len(v) - window + 1`` floats, one per window position,
        or an empty list when ``v`` is empty.

    Raises:
        ValueError: if ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("window size n must be a positive integer")
    if len(v) == 0:
        # Nothing to average; previously this divided by zero.
        return []

    window = min(len(v), n)
    averages = [float(sum(v[:window])) / window]
    # Slide the window one step at a time: add the entering element and drop
    # the leaving one instead of re-summing the whole window.
    for leaving, entering in zip(v, v[window:]):
        averages.append(averages[-1] + float(entering - leaving) / window)
    return averages

In [37]:
# Executer class for training the agent and for evaluating its performance.
class Executer:
    """Runs MCTS searches on one highway-env environment.

    With ``eval_flag=False`` every execution builds a fresh tree and stores
    its root in the module-level ``mcts_agent``. With ``eval_flag=True`` the
    search starts from the root previously stored there, so a training run
    for the same environment must have happened first.

    Args:
        env_name: gym environment id passed to ``gym.make``.
        num_execs: number of independent executions (environment resets).
        max_tree_depth: number of actions in one episode after which the
            random rollout is aborted and penalised.
        episodes: number of search episodes per execution.
        eval_flag: False to train and store a tree, True to reuse a stored one.
    """

    # Attribute of ``mcts_agent`` holding the root for each environment id.
    # Any id not listed here uses ``_DEFAULT_ROOT_ATTR``.
    _ROOT_ATTRS = {
        'highway-v0': 'lane_change_root',
        'merge-v0': 'merge_root',
        'intersection-v0': 'intersection_root',
    }
    _DEFAULT_ROOT_ATTR = 'roundabout_root'
    # Reward deducted when a rollout exceeds ``max_tree_depth`` actions.
    _DEPTH_PENALTY = 100

    def __init__(self, env_name, num_execs=10, max_tree_depth=1000, episodes=10000, eval_flag=False):
        self.env_name = env_name
        self.num_execs = num_execs
        self.max_tree_depth = max_tree_depth
        self.episodes = episodes
        self.eval_flag = eval_flag

    def print_stats(self, num_exec, score, avg_time, eval_flag):
        """Print the progress line for one finished execution.

        The line is always written in place (leading carriage return, no
        newline). It is additionally printed with a newline on every 10th
        execution and on the first execution of an evaluation run.
        """
        # The carriage return must come first: placed after the label it
        # moved the cursor back and the label was overwritten on screen.
        sys.stdout.write('\rexecution number: %3d   total reward:%10.3f   average time:%4.1f s' % (num_exec, score, avg_time))
        sys.stdout.flush()
        if num_exec % 10 == 0 or (num_exec == 1 and eval_flag):
            print("\rexecution number: %4d   total reward: %4.3f   average time: %4.2f s" % (num_exec, score, avg_time))

    def _root_attr(self):
        """Return the name of the ``mcts_agent`` attribute for this env."""
        return self._ROOT_ATTRS.get(self.env_name, self._DEFAULT_ROOT_ATTR)

    def _run_episode(self, env, root):
        """Run one select/expand/rollout/backpropagate pass from ``root``.

        Returns:
            ``(sum_reward, actions)``: the total reward collected in the
            simulated episode and the list of actions that produced it.
        """
        # Simulate on a deep copy. A shallow copy shares the environment's
        # internal objects, so simulated steps would alter the real env.
        state = deepcopy(env)

        sum_reward = 0
        node = root
        terminal = False
        actions = []

        # Selection: try each child once in order, then follow the best UCB.
        # Stop as soon as the simulated episode ends; a finished environment
        # must not be stepped again.
        while node.children and not terminal:
            if node.explored_children < len(node.children):
                child = node.children[node.explored_children]
                node.explored_children += 1
                node = child
            else:
                node = max(node.children, key=upper_conf_bound)
            _, reward, terminal, _ = state.step(node.action)
            sum_reward += reward
            actions.append(node.action)

        # Expansion: create one child per available action, in random order.
        if not terminal:
            node.children = [Node(node, a) for a in node_expansion(state.action_space)]
            random.shuffle(node.children)

        # Rollout: act randomly until the episode ends or becomes too long.
        while not terminal:
            action = state.action_space.sample()
            _, reward, terminal, _ = state.step(action)
            sum_reward += reward
            actions.append(action)

            if len(actions) > self.max_tree_depth:
                sum_reward -= self._DEPTH_PENALTY
                break

        # Backpropagation: credit the episode reward to every ancestor.
        while node is not None:
            node.visits += 1
            node.value += sum_reward
            node = node.parent

        return sum_reward, actions

    def _replay(self, env, actions):
        """Apply ``actions`` to the real ``env``; return the summed reward."""
        sum_reward = 0
        for action in actions:
            _, reward, terminal, _ = env.step(action)
            sum_reward += reward
            if terminal:
                break
        return sum_reward

    def execute(self):
        """Run ``num_execs`` executions and print statistics for each.

        Each execution resets the environment, runs ``episodes`` search
        episodes, replays the best action sequence found on the real
        environment and reports the best moving average of the replay
        rewards together with the average wall-clock time per execution.
        """
        print(self.env_name)
        # For maintaining list of best rewards.
        best_rewards = []
        start_time = time()
        env = gym.make(self.env_name)
        root_attr = self._root_attr()

        try:
            for loop in range(self.num_execs):
                env.reset()

                if self.eval_flag:
                    # Reuse the tree stored by an earlier training run.
                    root = getattr(mcts_agent, root_attr)
                else:
                    # Fresh tree; publish its root so it can be evaluated
                    # later. The root object never changes during the
                    # execution, so registering it once is sufficient.
                    root = Node(None, None)
                    setattr(mcts_agent, root_attr, root)

                # For capturing list of best actions taken by the agent.
                best_actions = []
                best_reward = float("-inf")

                for _ in range(self.episodes):
                    sum_reward, actions = self._run_episode(env, root)
                    # Retaining the best reward value and actions.
                    if best_reward < sum_reward:
                        best_reward = sum_reward
                        best_actions = actions

                # best_actions list stores the estimated action
                # policy post the episode execution loop.
                print("\n execution number: "+ str(loop+1) + " || Corresponding best action sequence: "+ str(best_actions) +"\n")
                best_rewards.append(self._replay(env, best_actions))

                score = max(moving_averages(best_rewards, 100))
                avg_time = (time() - start_time) / (loop + 1)
                self.print_stats(loop + 1, score, avg_time, self.eval_flag)
        finally:
            # Release the environment's resources even if a run fails.
            env.close()

In [None]:
# Running executer instance of training and evaluating the agent for
# every discrete environment in highway-env package.
# Training run for 'highway-v0': with eval_flag=False, execute() builds a
# fresh tree per execution and stores its root in mcts_agent.lane_change_root.
Executer(env_name='highway-v0', num_execs=10, max_tree_depth=100, episodes=5000, eval_flag=False).execute()

In [None]:
# Evaluation run for 'highway-v0': with eval_flag=True, execute() starts from
# the root stored in mcts_agent.lane_change_root, so run the training cell first.
Executer(env_name='highway-v0', num_execs=1, max_tree_depth=100, episodes=5000, eval_flag=True).execute()

In [None]:
# Training run for 'merge-v0': with eval_flag=False, execute() builds a fresh
# tree per execution and stores its root in mcts_agent.merge_root.
Executer(env_name='merge-v0', num_execs=10, max_tree_depth=48, episodes=2048, eval_flag=False).execute()

In [None]:
# Evaluation run for 'merge-v0': with eval_flag=True, execute() starts from
# the root stored in mcts_agent.merge_root, so run the training cell first.
Executer(env_name='merge-v0', num_execs=1, max_tree_depth=48, episodes=2048, eval_flag=True).execute()

In [None]:
# Training run for 'intersection-v0': with eval_flag=False, execute() builds a
# fresh tree per execution and stores its root in mcts_agent.intersection_root.
Executer(env_name='intersection-v0', num_execs=10, max_tree_depth=20, episodes=2000, eval_flag=False).execute()

In [None]:
# Evaluation run for 'intersection-v0': with eval_flag=True, execute() starts
# from the root stored in mcts_agent.intersection_root, so run the training
# cell first.
Executer(env_name='intersection-v0', num_execs=1, max_tree_depth=20, episodes=2000, eval_flag=True).execute()

In [16]:
# Training run for 'roundabout-v0': this id matches none of the explicit
# names checked in execute(), so the root is stored through the fallback
# branch in mcts_agent.roundabout_root.
Executer(env_name='roundabout-v0', num_execs=10, max_tree_depth=32, episodes=786, eval_flag=False).execute()

roundabout-v0

 execution number: 1 || Corresponding best action sequence: [3, 3, 3, 3, 3, 3, 2, 3, 1, 1, 0]

  1   total reward:    10.920   average time:554.6 s
 execution number: 2 || Corresponding best action sequence: [0, 0, 3, 2, 2, 0]

  2   total reward:     5.500   average time:283.9 s
 execution number: 3 || Corresponding best action sequence: [3, 0, 3, 3, 1, 3, 1, 1, 3, 1, 3]

  3   total reward:     7.320   average time:374.9 s
 execution number: 4 || Corresponding best action sequence: [2, 3]

  4   total reward:     5.510   average time:284.5 s
 execution number: 5 || Corresponding best action sequence: [3, 1, 3, 1, 1, 3, 3, 3, 3, 1, 3]

  5   total reward:     6.608   average time:339.9 s
 execution number: 6 || Corresponding best action sequence: [2, 0, 3]

  6   total reward:     5.520   average time:285.7 s
 execution number: 7 || Corresponding best action sequence: [0, 1, 3, 1, 4, 4, 0]

  7   total reward:     4.743   average time:247.1 s
 execution number: 8 || Cor

In [17]:
# Evaluation run for 'roundabout-v0': with eval_flag=True, execute() starts
# from the root stored in mcts_agent.roundabout_root, so run the training
# cell first.
Executer(env_name='roundabout-v0', num_execs=1, max_tree_depth=32, episodes=786, eval_flag=True).execute()

roundabout-v0

 execution number: 1 || Corresponding best action sequence: [3, 3, 1, 3, 0, 3, 1, 1, 1, 3, 0]

   1   total reward: 10.920   average time: 513.03 s
