In [None]:
!git clone http://github.com/HumanCompatibleAI/imitation
!pip install imitation/

Cloning into 'imitation'...
remote: Enumerating objects: 15710, done.[K
remote: Counting objects: 100% (3507/3507), done.[K
remote: Compressing objects: 100% (1244/1244), done.[K
remote: Total 15710 (delta 2404), reused 2970 (delta 2025), pack-reused 12203[K
Receiving objects: 100% (15710/15710), 28.56 MiB | 22.19 MiB/s, done.
Resolving deltas: 100% (10840/10840), done.
Processing ./imitation
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting gymnasium[classic-control]~=0.29 (from imitation==0.4.1.dev226704798)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting seals~=0.2.1 (from imitation==0.4.1.dev226704798)
  Downloading seals-0.2.1-py3-none-any.whl (35 kB)
Co

In [None]:
!pip install stable_baselines3
!pip install networkx
!pip install torch
!pip install numpy
!pip install gymnasium
!pip install nltk
import nltk
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Mount Google Drive (Colab only); later cells read files from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import networkx as nx
import numpy as np


class StateDiGraph:
    """
    Directed graph of the walkable cells of a CSV grid map.

    Every cell whose value is listed in ``allowed_values`` becomes a node named "row,col", and is
    connected to each of its four direct neighbours that is also allowed.  After construction,
    ``distance_dict[a][b]`` holds the shortest path length from node ``a`` to node ``b``.
    """

    def __init__(self, map_csv: str, allowed_values: list):
        """
        :param map_csv: Path to the comma separated map file
        :param allowed_values: Integer cell codes that count as walkable
        """
        assert isinstance(map_csv, str) and os.path.isfile(map_csv), "Error with map name argument!"
        assert all(isinstance(item, int) for item in allowed_values), "Error! allowed_values must be a list of ints"

        self.map_csv = np.genfromtxt(map_csv, delimiter=",")
        self.allowed_values = allowed_values
        self.graph = nx.DiGraph()

        n_rows, n_cols = self.map_csv.shape
        # Up, down, left, right
        offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))

        for r, c in np.ndindex(n_rows, n_cols):
            value = self.map_csv[r, c]

            # Cells that are not walkable never become a source node
            if value not in self.allowed_values:
                continue

            source = f"{r},{c}"
            self.graph.add_node(source, value=value)

            for d_row, d_col in offsets:
                nr, nc = r + d_row, c + d_col
                inside = 0 <= nr < n_rows and 0 <= nc < n_cols
                if inside and self.map_csv[nr, nc] in allowed_values:
                    self.graph.add_edge(source, f"{nr},{nc}")

        # Shortest path lengths between every pair of connected nodes
        self.distance_dict = dict(nx.all_pairs_dijkstra_path_length(self.graph))




In [None]:
import gymnasium as gym
import numpy as np
import os
from abc import ABC
from gymnasium import spaces
from scipy.spatial.distance import cityblock
import networkx as nx

# Integer codes of the cell types found in the map CSV.
LAND = 0
HOUSE = 1
ROAD = 2
TREE = 3
START = 4
EXIT = 5
ROCK = 6
AMMO = 7
# NOTE(review): no constant is defined for code 8 in this cell - confirm that is intentional.
TANK = 9


def reward_normalize(distance: int, max_value: int):
    """
    Converts a distance into a reward that falls off linearly as the distance grows.

    :param distance: Integer distance to the objective
    :param max_value: Integer distance at which the reward reaches 0
    :return: 1 - distance / max_value
    """
    checks = (
        (distance, "Error distance must be an int"),
        (max_value, "Error max_value must be an int"),
    )
    for argument, message in checks:
        assert isinstance(argument, int), message

    fraction = distance / max_value
    return 1 - fraction


def get_subarray(large_array, row_idx, col_idx, subarray_size=20):
    """
    Cuts the window centred on (row_idx, col_idx) out of a 2-D array, clipped to the array bounds.

    The window reaches ``subarray_size // 2`` cells in every direction from the centre.  A centre
    that lies outside the array yields only the part of its window that overlaps the array, which
    may be empty.

    :param large_array: 2-D numpy array to cut from
    :param row_idx: Row of the window centre
    :param col_idx: Column of the window centre
    :param subarray_size: Nominal window size; the radius is subarray_size // 2
    :return: Tuple of (window, (row_start, col_start)) where the second item is the position of
             the window's first cell inside large_array
    """
    array_shape = large_array.shape
    subarray_radius = subarray_size // 2

    # Calculate the range of rows and columns to extract
    row_start = max(row_idx - subarray_radius, 0)
    col_start = max(col_idx - subarray_radius, 0)
    # The stop indices are clamped at 0 as well: a negative stop would be interpreted by numpy
    # as "counted from the end" and silently return a large, unrelated part of the array.
    row_end = max(min(row_idx + subarray_radius + 1, array_shape[0]), 0)
    col_end = max(min(col_idx + subarray_radius + 1, array_shape[1]), 0)

    # Extract the subarray
    subarray = large_array[row_start:row_end, col_start:col_end]

    return subarray, (row_start, col_start)


def get_state_string(first, second):
    """
    Builds the "first,second" key used to name a grid cell.

    :param first: Row component of the key
    :param second: Column component of the key
    :return: The two values joined by a comma
    """
    return "{},{}".format(first, second)


class FullAnchoringBaseline(gym.Env):
    metadata = {"render_modes": ["console"]}

    def __init__(self, map_csv: str, render_mode=None):
        """
        Loads the map, locates the agent, tanks and exits, precomputes the reward lookup tables
        and declares the observation and action spaces.

        :param map_csv: Path to the comma separated map file
        :param render_mode: None, or one of the modes listed in self.metadata["render_modes"]
        """
        assert isinstance(map_csv, str) and os.path.isfile(map_csv), "Error with map_csv argument!"
        assert render_mode is None or render_mode in self.metadata["render_modes"]

        self.render_mode = render_mode
        self._map_name = map_csv

        # Map and the values derived directly from it
        self.map_csv = np.genfromtxt(map_csv, delimiter=",")
        n_rows, n_cols = self.map_csv.shape
        self._num_rows = n_rows
        self._num_cols = n_cols
        self._size = int(n_rows * n_cols)
        # Half of all tanks on the map: the number of tanks per map half
        self._tank_count = int(np.count_nonzero(self.map_csv == TANK) / 2)
        # [row, col] of the first START cell
        self._agent_location = np.argwhere(self.map_csv == START)[0]
        # Cell types the agent may not step onto
        self._untraversable = [HOUSE, TREE, START, ROCK]
        # Table that maps [row, col] to a state number, used when building observations
        self._state_array = np.arange(0, n_rows * n_cols).reshape(n_rows, n_cols)

        # Per-episode bookkeeping
        self._agent_top = None  # True / False once the agent picked the top / bottom half
        self._tank_list = []  # Tanks of the half the agent picked
        self._tanks_destroyed = []  # Tanks destroyed so far
        self._reward_matrix = {}  # target "row,col" -> {cell "row,col" -> reward}
        self._state_graph = StateDiGraph(map_csv, [LAND, ROAD, AMMO, TANK, START, EXIT])
        self._target = None  # Current objective: a tank or the exit
        self.episode_length = 0

        # The map is split into the rows above and the rows below row `mid_row`
        mid_row = 10
        bottom_offset = mid_row + 1
        top_half = self.map_csv[:mid_row, :]
        bottom_half = self.map_csv[bottom_offset:, :]

        # Tank positions per half; bottom positions are shifted back into full-map coordinates
        self._tank_loc_top = np.argwhere(top_half == TANK)
        self._tank_loc_bottom = np.array(
            [[cell[0] + bottom_offset, cell[1]] for cell in np.argwhere(bottom_half == TANK)]
        )

        # Exit positions per half, handled the same way
        self._exit_loc_top = np.argwhere(top_half == EXIT)
        self._exit_loc_bottom = np.array(
            [[cell[0] + bottom_offset, cell[1]] for cell in np.argwhere(bottom_half == EXIT)]
        )

        # Rewards are precomputed for every exit and every tank so that they can be looked up in
        # a dictionary while stepping instead of being recalculated.
        self._get_reward_matrix(top_half, self._exit_loc_top)
        self._get_reward_matrix(top_half, self._tank_loc_top)
        self._get_reward_matrix(bottom_half, self._exit_loc_bottom, True)
        self._get_reward_matrix(bottom_half, self._tank_loc_bottom, True)

        # Observation: [agent state, target state, exit state].  The last two components have one
        # extra value (self._size) that stands for "no value", since the space cannot hold null.
        self.observation_space = spaces.MultiDiscrete([self._size, self._size + 1, self._size + 1])
        # Five actions: "right", "up", "left", "down", "fire"
        self.action_space = spaces.Discrete(5)

        # Movement actions mapped to the [row, col] offset they apply to the agent
        right = np.array([0, 1], dtype=int)
        up = np.array([-1, 0], dtype=int)
        left = np.array([0, -1], dtype=int)
        down = np.array([1, 0], dtype=int)
        self._action_to_direction = {0: right, 1: up, 2: left, 3: down}

    def _check_radar(self) -> bool:
        """
        Destroys the first live tank found inside the agent's vision window.

        The window is cut from the half of the map the agent committed to (rows 0-9 for the top,
        rows 11 onward for the bottom) and reaches one cell around the agent.  A destroyed tank is
        appended to self._tanks_destroyed and its cell in self.map_csv is set to 0 so that it
        cannot be targeted again.
        :return: True if a tank was destroyed by this call, otherwise False
        """
        subarray_size = 2
        radius = subarray_size // 2

        # The agent has not committed to a half yet, so there is nothing to scan
        if self._agent_top is None:
            return False

        agent_row, agent_col = self._agent_location[0], self._agent_location[1]
        if self._agent_top:
            vision_array, (start_row, start_col) = get_subarray(self.map_csv[:10, :], agent_row,
                                                                agent_col, subarray_size=subarray_size)
        else:
            vision_array, (start_row, start_col) = get_subarray(self.map_csv[11:, :], agent_row - 11,
                                                                agent_col, subarray_size=subarray_size)
            # Shift the window origin back into full-map coordinates
            start_row += 11

        for tank in np.argwhere(vision_array == TANK):
            row_tank_map_loc = int(start_row + tank[0])
            col_tank_map_loc = int(start_col + tank[1])

            # Safety net: when the agent stands outside its own half the window can cover cells
            # that are not next to the agent; such tanks must not be hit.
            if abs(row_tank_map_loc - agent_row) > radius or abs(col_tank_map_loc - agent_col) > radius:
                continue

            location = [row_tank_map_loc, col_tank_map_loc]
            if location in self._tanks_destroyed:
                continue

            self._tanks_destroyed.append(location)
            self.map_csv[row_tank_map_loc][col_tank_map_loc] = 0
            return True

        # Every path returns a real bool (the original fell through with None in one case)
        return False

    def _get_closest_objective(self):
        """
        Updates self._target with the objective the agent should pursue next.

        Once the number of destroyed tanks equals self._tank_count the target is the exit of the
        agent's half (None while no half has been chosen).  Otherwise the target is the
        not-yet-destroyed tank in self._tank_list with the smallest combined distance, where the
        combined distance is 0.7 * graph shortest path length + 0.3 * Manhattan distance.  If no
        such tank exists, self._target is left unchanged.
        :return: None
        :raises ValueError: if self._tank_list is None
        """
        # All tanks destroyed: head for the exit
        if len(self._tanks_destroyed) == self._tank_count:
            if self._agent_top is None:
                self._target = None
            elif self._agent_top:
                self._target = self._exit_loc_top[0]
            else:
                self._target = self._exit_loc_bottom[0]
            return

        # The original built this exception without raising it
        if self._tank_list is None:
            raise ValueError("Error! Tank list is empty in get closest objective method")

        min_distance = float('inf')
        current_location = get_state_string(self._agent_location[0], self._agent_location[1])

        for tank in self._tank_list:
            if tank in self._tanks_destroyed:
                continue

            tank_name = get_state_string(tank[0], tank[1])
            # Primary distance: shortest path length in the state graph
            distance = self._state_graph.distance_dict[current_location][tank_name]
            # Secondary distance: Manhattan distance
            manhattan_distance = cityblock(self._agent_location, tank)
            combined_distance = 0.7 * distance + 0.3 * manhattan_distance
            if combined_distance < min_distance:
                min_distance = combined_distance
                self._target = tank

    def _get_info(self) -> dict:
        """
        Supplies the auxiliary info dictionary that accompanies each observation.
        :return: An empty dictionary; no additional information is reported
        """
        return dict()

    def _get_obs(self) -> np.ndarray:
        """
        Builds the observation as state numbers: [agent state, target state, exit state].

        self._size is used as the "no value" marker: for the exit while the agent has not chosen
        a half of the map, and for the target while self._target is None.
        :return: numpy array with the three state numbers
        """
        agent_state = self._get_state(self._agent_location)

        if self._agent_top is None:
            exit_state = self._size  # Null value
        elif self._agent_top:
            exit_state = self._get_state(self._exit_loc_top[0])
        else:
            exit_state = self._get_state(self._exit_loc_bottom[0])

        # The null marker is derived from the map size instead of the literal 840, so that it
        # stays the highest value of the observation space for maps of any size.
        if self._target is None:
            target_state = self._size
        else:
            target_state = self._get_state(self._target)

        return np.array([agent_state, target_state, exit_state])

    def _get_reward(self) -> float:
        """
        Returns the reward for the agent's current cell.

        - 0.0 while the agent has not chosen a half of the map.
        - When all tanks are destroyed: 1000.0 on the exit of the agent's half, otherwise the
          precomputed reward of the current cell towards that exit.
        - While tanks remain: the precomputed reward of the current cell towards self._target,
          or 0.0 when there is no target.
        A cell that has no entry in the reward table of the objective yields 0.0.
        :return: Float value indicating reward
        """
        # The agent is still undecided between top and bottom
        if self._agent_top is None:
            return 0.0

        if len(self._tanks_destroyed) == self._tank_count:
            # All tanks are destroyed, the objective is the exit of the agent's half
            objective = self._exit_loc_top[0] if self._agent_top else self._exit_loc_bottom[0]
            if np.array_equal(self._agent_location, objective):
                return 1000.0
        elif self._target is not None:
            # The objective is the tank that is currently targeted
            objective = self._target
        else:
            return 0.0

        objective_key = get_state_string(objective[0], objective[1])
        agent_key = get_state_string(self._agent_location[0], self._agent_location[1])
        # The table of an objective only holds cells of its own half that can reach it; any
        # other cell is worth nothing instead of raising a KeyError.
        return float(self._reward_matrix[objective_key].get(agent_key, 0.0))

    def _get_reward_matrix(self, array: np.ndarray, map_object: np.ndarray, is_bottom: bool = False):
        """
        Adds one reward table per element of map_object to self._reward_matrix.

        The key in self._reward_matrix is the string 'row,col' of the element.  The value is a
        dictionary that maps the 'row,col' string of every cell of `array` that can reach the
        element to a reward: 1 minus the shortest path length to the element, normalised by the
        largest shortest path length from the element.  Cells that cannot reach the element are
        left out.
        :param array: The numpy array associated with the current map, either top or bottom
        :param map_object: Array of [row, col] targets in full-map coordinates, usually the tanks
                           or the exit of the top/bottom map
        :param is_bottom: True when `array` is the bottom half; its row indices are then shifted
                          by 11 to obtain full-map coordinates
        :return: None
        """
        assert isinstance(array, np.ndarray), "Error! array must be a numpy array"
        assert isinstance(map_object, np.ndarray), "Error! map_object must be a numpy array"

        rows, cols = array.shape
        # Row offset between `array` and the full map
        row_offset = 11 if is_bottom else 0

        # First we create a graph network for the passed map, and create a distance dictionary
        graph = nx.DiGraph()
        for i in range(rows):
            for j in range(cols):
                # Read the cell from `array`, not from the full map: for the bottom half the
                # full map at [i, j] is a different (top half) cell.
                cell_value = array[i, j]
                if cell_value in self._untraversable:
                    continue

                node_name = get_state_string(i + row_offset, j)
                graph.add_node(node_name, value=cell_value)

                # Directed edges to the traversable direct neighbours
                for ni, nj in ((i - 1, j), (i + 1, j), (i, j - 1), (i, j + 1)):
                    if 0 <= ni < rows and 0 <= nj < cols and array[ni, nj] not in self._untraversable:
                        graph.add_edge(node_name, get_state_string(ni + row_offset, nj))

        distance_dict = dict(nx.all_pairs_dijkstra_path_length(graph))
        del graph

        for element in map_object:
            location = get_state_string(element[0], element[1])
            element_distance_dict = distance_dict[location]
            # Step value, based on the largest distance to the element.  An element that reaches
            # no other cell has a largest distance of 0; the step is then 0 instead of 1 / 0.
            max_distance = max(element_distance_dict.values())
            element_step_value = 1 / max_distance if max_distance else 0.0

            result_dict = {}
            for i in range(rows):
                for j in range(cols):
                    cell_key = get_state_string(i + row_offset, j)
                    # Untraversable cells are not in the graph, and some cells cannot reach
                    # the element; both are skipped.
                    cell_distances = distance_dict.get(cell_key)
                    if cell_distances is None or location not in cell_distances:
                        continue
                    result_dict[cell_key] = 1 - (cell_distances[location] * element_step_value)

            # Add the dictionary to the reward matrix using the element as the key name
            self._reward_matrix[location] = result_dict

    def _get_state(self, location: list) -> int:
        """
        Translates a cell location into its state number by looking it up in self._state_array.
        The result is used when building observations.
        :param location: The cell location as [row, col]
        :return: The state number of the cell as a plain int
        """
        row, col = location[0], location[1]
        return int(self._state_array[row][col])

    def _get_tank_reward(self) -> float:
        """
        Looks up the precomputed reward of the agent's current cell towards the current target.
        :return: The reward from self._reward_matrix, or 0.0 when there is no target
        """
        # Without a target there is nothing to look up (the original raised UnboundLocalError)
        if self._target is None:
            return 0.0

        tank_name = f"{self._target[0]},{self._target[1]}"
        agent_name = f"{self._agent_location[0]},{self._agent_location[1]}"
        return self._reward_matrix[tank_name][agent_name]

    def render(self):
        """
        Placeholder for rendering; nothing is drawn for any render mode.
        :return: None
        """
        console_requested = self.render_mode == "console"
        if console_requested:
            # Console rendering is not implemented yet
            return None
        return None

    def reset(self, seed=None, options=None) -> tuple:
        """
        Starts a new episode: reloads the map from disk, puts the agent back on the START cell
        and clears all per-episode bookkeeping.
        :param seed: Seed forwarded to the parent reset, which seeds self.np_random
        :param options: Accepted for API compatibility; not used
        :return: Tuple of (observation, info)
        """
        # Seeds self.np_random
        super().reset(seed=seed)

        # Fresh copy of the map, since destroyed tanks were zeroed out on the previous one
        self.map_csv = np.genfromtxt(self._map_name, delimiter=",")
        start_cells = np.argwhere(self.map_csv == START)
        self._agent_location = start_cells[0]

        # Per-episode bookkeeping
        self.episode_length = 0
        self._tank_list = []
        self._tanks_destroyed = []
        self._target = None
        self._agent_top = None

        return self._get_obs(), self._get_info()

    def step(self, action) -> tuple:
        """
        Advances the environment by one action.

        Actions 0-3 move the agent (see self._action_to_direction); a move off the map or onto
        an untraversable cell leaves the agent where it is.  The first successful "up" or "down"
        move commits the agent to the top or bottom half of the map.  Action 4 fires: a tank
        inside the radar window is destroyed and rewarded with 1000.  Movement actions are
        rewarded with self._get_reward().

        The episode terminates when more than 20000 steps were taken, or when the agent reaches
        the exit of its half after all tanks were destroyed.
        :param action: Integer in [0, 4]
        :return: Tuple of (observation, reward, terminated, truncated, info); truncated is
                 always False
        """
        assert action in [0, 1, 2, 3, 4], "Error! Action must be 0, 1, 2, 3, 4"
        reward = 0
        self.episode_length += 1
        # Step budget: once exceeded the episode ends, whatever else happens in this step
        terminated = self.episode_length > 20000

        # If the action is a movement action
        if action in [0, 1, 2, 3]:
            direction = self._action_to_direction[action]
            new_state = self._agent_location + direction

            # Make sure we're still on the map and the cell can be entered
            on_map = (0 <= new_state[0] < self._num_rows) and (0 <= new_state[1] < self._num_cols)
            if on_map and self.map_csv[new_state[0]][new_state[1]] not in self._untraversable:
                self._agent_location += direction

                # If this is the first step toward top or bottom, set the agent top variable
                if self._agent_top is None:
                    if action == 1:
                        self._agent_top = True
                        self._tank_list.extend(self._tank_loc_top.tolist())
                        self._get_closest_objective()
                    elif action == 3:
                        self._agent_top = False
                        self._tank_list.extend(self._tank_loc_bottom.tolist())
                        self._get_closest_objective()

                # Don't allow an exit until all tanks have been destroyed
                if len(self._tanks_destroyed) == self._tank_count:
                    exit_location = self._exit_loc_top[0] if self._agent_top else self._exit_loc_bottom[0]
                    if np.array_equal(self._agent_location, exit_location):
                        print("Exit Reached")
                        # Only ever switch the flag on, so the step budget above is not undone
                        terminated = True

            # The exit bonus is part of _get_reward, so no separate assignment is needed
            reward = self._get_reward()

        # If the action is a weapons fire
        elif action == 4:
            if self._check_radar():
                print("Tank Destroyed")
                reward = 1000
                self._get_closest_objective()

        # Get the information to be returned by the step method for gym
        observation = self._get_obs()
        info = self._get_info()

        return observation, reward, terminated, False, info



In [None]:
from locals import state_to_row_col
from locals import coord_to_state
from anch_human_demonstr import par1_cond1


def location_to_string(location: list):
    """
    Formats a [row, col] location as the string "row,col".

    :param location: Indexable holding the row at position 0 and the column at position 1
    :return: The two values joined by a comma
    """
    row, col = location[0], location[1]
    return "{},{}".format(row, col)


class ExpertAnchorBaseline(FullAnchoringBaseline):
    """
    This class translates the actions undertaken by the human demonstrators, and returns the observation information
    """

    def __init__(self, map_name: str, expert_data: dict, max_episode_length: int = 260,
                 max_episode_threshold: int = 187, gamma: float = .99, rho: float = .5):
        """
        Builds the base environment and prepares it for replaying one expert demonstration.

        :param map_name: Path to the comma separated map file
        :param expert_data: Expert demonstration.  expert_data["targets"] holds, in order, the
                            state codes of the targets the human destroyed; expert_data["data"]
                            holds (observation, action) items that step() compares against.
        :param max_episode_length: Stored on the instance; not used by the code in this class
        :param max_episode_threshold: Number of steps after which step() ends the episode
        :param gamma: Stored on the instance; not used by the code in this class
        :param rho: Hyperparameter to offset the reward value if the agent is replicating the
                    human expert behavior; stored on the instance only
        """
        super().__init__(map_name)
        self._expert_data = expert_data
        self._target_list = list(self._expert_data["targets"])
        # int() matches the lookup performed in reset()
        self._target_list_coord = [state_to_row_col[int(state_code)] for state_code in self._target_list]
        self._gamma = gamma
        self._rho = rho
        self._max_episode_length = max_episode_length
        self._max_episode_threshold = max_episode_threshold

        # Determine if the expert's targets were top or bottom of the map.  The original test,
        # `any(coords) in tank_locations`, checked a single bool for membership.  The check now
        # compares state numbers.
        # NOTE(review): assumes the expert's target state codes use the same numbering as
        # self._get_state - confirm against the `locals` module.
        top_tank_states = {self._get_state(tank) for tank in self._tank_loc_top}
        targets_on_top = any(int(state_code) in top_tank_states for state_code in self._target_list)

        # Zero out the tanks of the half the expert did not work on
        unused_tanks = self._tank_loc_bottom if targets_on_top else self._tank_loc_top
        for tank in unused_tanks:
            self.map_csv[tank[0]][tank[1]] = 0

        self._visited_states = []

    def _get_obs(self) -> np.ndarray:
        """
        Builds the observation as state numbers: [agent state, target state, exit state].

        The target is the first remaining entry of the expert's target list.  self._size is used
        as the "no value" marker: for the exit while the agent has not chosen a half of the map,
        and for the target once the expert's target list is exhausted.
        :return: Integer numpy array with the three state numbers
        """
        agent_state = self._get_state(self._agent_location)

        if self._agent_top is None:
            exit_state = self._size  # Null value
        elif self._agent_top:
            exit_state = self._get_state(self._exit_loc_top[0])
        else:
            exit_state = self._get_state(self._exit_loc_bottom[0])

        # step() pops a target for every destroyed tank, so the list can run empty
        target_state = self._target_list[0] if self._target_list else self._size

        return np.array([int(agent_state), int(target_state), int(exit_state)], dtype=int)

    def step(self, action) -> tuple:
        """
        Advances the environment by one action and rewards agreement with the expert.

        Every step starts with a reward of -0.01.  A successful move earns 1 when the
        (observation before the move, action) pair occurs in expert_data["data"], and 1 when it
        reaches the exit of the agent's half after all tanks were destroyed.  Firing (action 4)
        earns 1 when a tank is destroyed, and the expert's next target then becomes current.

        The episode terminates when more than self._max_episode_threshold steps were taken, or
        when the exit is reached after all tanks were destroyed.
        :param action: Integer in [0, 4]
        :return: Tuple of (observation, reward, terminated, truncated, info); truncated is
                 always False
        """
        assert action in [0, 1, 2, 3, 4], "Error! Action must be 0, 1, 2, 3, 4"
        reward = -.01  # 1 / max_episode_threshold
        obs = list(self._get_obs())
        self.episode_length += 1
        # Step budget: once exceeded the episode ends, whatever else happens in this step
        terminated = self.episode_length > self._max_episode_threshold

        # If the action is a movement action
        if action in [0, 1, 2, 3]:
            direction = self._action_to_direction[action]
            new_state = self._agent_location + direction

            # Make sure we're still on the map and the cell can be entered
            on_map = (0 <= new_state[0] < self._num_rows) and (0 <= new_state[1] < self._num_cols)
            if on_map and self.map_csv[new_state[0]][new_state[1]] not in self._untraversable:

                # Reward the move when the expert took the same action from the same observation.
                # np.array_equal compares element-wise whatever sequence type the data uses.
                # NOTE(review): assumes each item is (observation, action) - confirm data layout.
                if any(action == items[1] and np.array_equal(obs, items[0])
                       for items in self._expert_data["data"]):
                    reward = 1

                self._agent_location += direction

                # If this is the first step toward top or bottom, set the agent top variable
                if self._agent_top is None:
                    if action == 1:
                        self._agent_top = True
                        self._tank_list.extend(self._tank_loc_top.tolist())
                        self._get_closest_objective()
                    elif action == 3:
                        self._agent_top = False
                        self._tank_list.extend(self._tank_loc_bottom.tolist())
                        self._get_closest_objective()

                # Don't allow an exit until all tanks have been destroyed
                if len(self._tanks_destroyed) == self._tank_count:
                    exit_location = self._exit_loc_top[0] if self._agent_top else self._exit_loc_bottom[0]
                    if np.array_equal(self._agent_location, exit_location):
                        print("Exit Reached")
                        reward = 1
                        # Only ever switch the flag on, so the step budget above is not undone
                        terminated = True

        # If the action is a weapons fire
        elif action == 4:

            # TODO: FIX THIS SO THERE IS NO REWARD FOR DESTROYING TANKS OFF MAP
            if self._check_radar():
                print("Tank Destroyed")
                reward = 1
                self._get_closest_objective()
                # Move on to the expert's next target, if any is left
                if self._target_list:
                    self._target_list.pop(0)

        # Get the information to be returned by the step method for gym
        observation = self._get_obs()
        info = self._get_info()
        return observation, reward, terminated, False, info

    def reset(self, seed=None, options=None) -> tuple:
        """
        Starts a new episode: restores the expert's target list, resets the base environment
        (map reload, agent back on START, cleared bookkeeping) and removes the tanks of the half
        of the map the expert did not work on.
        :param seed: Seed forwarded to the parent reset, which seeds self.np_random
        :param options: Accepted for API compatibility; not used
        :return: Tuple of (observation, info)
        """
        # Restore the targets first: the parent reset builds an observation through
        # self._get_obs(), which reads the first entry of the target list.
        self._target_list = list(self._expert_data["targets"])
        self._target_list_coord = [state_to_row_col[int(state_code)] for state_code in self._target_list]

        # Seeds self.np_random, reloads the map and resets the agent and all bookkeeping
        super().reset(seed=seed)

        # The reload brought back every tank, so the tanks of the unused half have to be
        # zeroed out again, as the constructor does.
        # NOTE(review): assumes the expert's target state codes use the same numbering as
        # self._get_state - confirm against the `locals` module.
        top_tank_states = {self._get_state(tank) for tank in self._tank_loc_top}
        targets_on_top = any(int(state_code) in top_tank_states for state_code in self._target_list)
        unused_tanks = self._tank_loc_bottom if targets_on_top else self._tank_loc_top
        for tank in unused_tanks:
            self.map_csv[tank[0]][tank[1]] = 0

        observation = self._get_obs()
        info = self._get_info()

        return observation, info


In [None]:
import tempfile
import numpy as np
import gym
from stable_baselines3 import DQN
from nltk.translate import meteor_score
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.dqn import MlpPolicy
from stable_baselines3.common.monitor import Monitor
from imitation.algorithms import bc
from imitation.data.types import Transitions
from imitation.data.types import Trajectory
from imitation.algorithms.dagger import SimpleDAggerTrainer
from anch_human_demonstr import par1_cond1
from anch_human_demonstr import par1_cond2
from anch_human_demonstr import par1_cond3
from anch_human_demonstr import par1_cond4

# Recorded demonstration data (file name suggests participant 1, condition 4).
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so this
# file must come from a trusted source.
EXPERT_DATA_PATH = "/content/drive/MyDrive/human_dem_par1_cond4.npz"
expert_data = np.load(EXPERT_DATA_PATH, allow_pickle=True)

# Shared random generator with a fixed seed
RNG_SEED = 0
rng = np.random.default_rng(RNG_SEED)

def make_custom_env(map, expert):
    """
    Build a new ``ExpertAnchorBaseline`` environment.

    :param map: passed through as the first constructor argument
        (callers in this notebook pass the path of the map CSV file)
    :param expert: passed through as the second constructor argument
    :return: the newly constructed ``ExpertAnchorBaseline`` instance
    """
    environment = ExpertAnchorBaseline(map, expert)
    return environment


# NOTE: `map` shadows the Python builtin of the same name. The name is kept
# because later statements in this cell refer to it.
map = "/content/drive/MyDrive/anchoring_baseline_urban_top_10.csv"
condition = par1_cond4

# Monitor-wrapped environment; its observation/action spaces are used when the
# BC trainer is built further down.
env = make_custom_env(map, condition)
env = Monitor(env)

# `load` is a classmethod that builds and returns a new model from the saved
# archive. Calling it on the class avoids constructing a throwaway DQN (policy
# networks plus replay buffer) that would be discarded immediately.
expert = DQN.load("/content/drive/MyDrive/DQN_RS_participant1_condition4.zip")

# Vectorised environment handed to the DAgger trainer; the lambda creates a
# fresh environment when DummyVecEnv calls it.
venv = DummyVecEnv([lambda: make_custom_env(map, condition)])

# Build a list of the 'wordings' from the set of expert trajectories.
# A wording consists of "obs,action".  These are added as strings
# since the METEOR method requires strings.  Since we are iterating through
# all observations too, we can simultaneously build the expert
# Trajectory objects that can be passed to dagger.
expert_trajectory_list = []
expert_trajectory = []
trajectory_list = []
traj_obs = []
traj_acts = []
traj_infos = []

# An .npz archive is read lazily: every `expert_data[key]` access loads the
# whole array again.  Fetch each array once instead of once per loop iteration.
expert_obs = expert_data["obs"]
expert_acts = expert_data["acts"]
expert_next_obs = expert_data["next_obs"]
expert_dones = expert_data["dones"]
expert_infos = expert_data["infos"]

for i in range(len(expert_obs)):
    # Build the word for the Meteor score and add it to the
    # expert trajectory list
    word = str(expert_obs[i]) + "," + str(expert_acts[i])
    expert_trajectory.append(word)

    # Every transition, including the one that ends the episode, contributes
    # its observation, action and info.  Previously the action and info of the
    # final transition were dropped.
    traj_obs.append(expert_obs[i])
    traj_acts.append(expert_acts[i])
    traj_infos.append(expert_infos[i])

    if expert_dones[i]:
        # A trajectory holds one more observation than actions: close it with
        # the observation reached after the final action.
        traj_obs.append(expert_next_obs[i])

        # Append the expert trajectory to the list, and start a new one
        expert_trajectory_list.append(expert_trajectory)
        expert_trajectory = []

        # Create the Trajectory object, and add it to the trajectory
        # list that can be passed to dagger.  Reset the lists afterwards.
        trajectory_one = Trajectory(
            obs=np.asarray(traj_obs),
            acts=np.asarray(traj_acts),
            infos=np.asarray(traj_infos),
            terminal=True
        )
        trajectory_list.append(trajectory_one)
        traj_obs = []
        traj_acts = []
        traj_infos = []

# Flat transition view of the expert data.  It is not currently handed to the
# BC trainer (see the commented-out `demonstrations` argument below).
transition_fields = ("obs", "acts", "next_obs", "dones", "infos")
transitions = Transitions(
    **{field: expert_data[field] for field in transition_fields}
)

# Behaviour-cloning learner; spaces come from the Monitor-wrapped environment
bc_trainer_kwargs = dict(
    observation_space=env.observation_space,
    action_space=env.action_space,
    rng=rng,
    #demonstrations=transitions
)
bc_trainer = bc.BC(**bc_trainer_kwargs)

# Training schedule.
# DAGGER_TRAIN_BUDGET is the first positional argument of `train`
# (presumably the total number of environment timesteps - confirm against the
# imitation API); BC_EPOCHS_PER_ROUND is forwarded to the BC trainer.
DAGGER_TRAIN_BUDGET = 1000
BC_EPOCHS_PER_ROUND = 200

# The scratch directory only exists inside this `with` block; `dagger_trainer`
# itself stays bound afterwards and is used by the evaluation code below.
with tempfile.TemporaryDirectory(prefix="dagger_example_") as tmpdir:
    print(tmpdir)
    dagger_kwargs = dict(
        venv=venv,
        scratch_dir=tmpdir,
        expert_policy=expert,
        bc_trainer=bc_trainer,
        rng=rng,
        #expert_trajs=trajectory_list,
    )
    dagger_trainer = SimpleDAggerTrainer(**dagger_kwargs)
    dagger_trainer.allow_variable_horizon = True
    dagger_trainer.train(
        DAGGER_TRAIN_BUDGET,
        bc_train_kwargs={"n_epochs": BC_EPOCHS_PER_ROUND},
    )

# Evaluate the trained DAgger policy on a fresh, Monitor-wrapped environment.
nenv = make_custom_env(map, condition)
nenv = Monitor(nenv)

# Number of evaluation episodes to roll out
N_EVAL_EPISODES = 1000

agent_trajectory = []  # "obs,action" words of the episode in progress
meteor_scores = []     # flat list: one score per (episode, expert trajectory) pair
for episode_idx in range(N_EVAL_EPISODES):
    # reset() returns (observation, info)
    obs, info = nenv.reset()
    total_reward = 0
    while True:
        # predict() returns (action, state); only the action is needed
        action = dagger_trainer.policy.predict(obs)
        action = int(action[0])
        word = str(obs) + "," + str(action)
        agent_trajectory.append(word)
        obs, reward, terminated, truncated, info = nenv.step(action)
        total_reward += reward

        # An episode is over when it is terminated OR truncated.  Checking
        # only `terminated` would loop forever on a truncated episode.
        if terminated or truncated:
            print(f"{total_reward}", end="")
            # Score this episode against every expert trajectory
            for trajs in expert_trajectory_list:
                score = meteor_score.single_meteor_score(trajs, agent_trajectory)
                meteor_scores.append(score)
                print(f", {score}", end="")
            print()
            agent_trajectory = []
            break
print("METEOR SCORES")
print(meteor_scores)





  and should_run_async(code)


Using cpu device
Wrapping the env in a DummyVecEnv.
/tmp/dagger_example_h256teta
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00161 |
|    entropy        | 1.61     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 129      |
|    loss           | 1.61     |
|    neglogp        | 1.61     |
|    prob_true_act  | 0.2      |
|    samples_so_far | 32       |
| rollout/          |          |
|    return_max     | 9.23     |
|    return_mean    | 1.96     |
|    return_min     | -1.88    |
|    return_std     | 4.8      |
--------------------------------


17batch [00:00, 24.12batch/s]
30batch [00:01, 42.23batch/s]
45batch [00:01, 62.80batch/s]
58batch [00:01, 77.15batch/s]
71batch [00:01, 88.45batch/s]
101batch [00:01, 115.00batch/s]
116batch [00:01, 122.87batch/s]
131batch [00:01, 128.47batch/s]
145batch [00:01, 128.08batch/s]
159batch [00:02, 123.36batch/s]
172batch [00:02, 123.52batch/s]
202batch [00:02, 131.37batch/s]
217batch [00:02, 135.49batch/s]
232batch [00:02, 138.18batch/s]
246batch [00:02, 137.80batch/s]
261batch [00:02, 139.47batch/s]
276batch [00:02, 138.93batch/s]
306batch [00:03, 142.68batch/s]
321batch [00:03, 134.17batch/s]
335batch [00:03, 128.47batch/s]
348batch [00:03, 126.05batch/s]
363batch [00:03, 132.25batch/s]
391batch [00:03, 132.10batch/s]
405batch [00:03, 126.19batch/s]
418batch [00:03, 121.90batch/s]
431batch [00:04, 116.44batch/s]
445batch [00:04, 121.79batch/s]
474batch [00:04, 127.94batch/s]
488batch [00:04, 130.92batch/s]
Epoch 28 of 200                [A

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000203 |
|    entropy        | 0.203     |
|    epoch          | 29        |
|    l2_loss        | 0         |
|    l2_norm        | 218       |
|    loss           | 0.0855    |
|    neglogp        | 0.0857    |
|    prob_true_act  | 0.932     |
|    samples_so_far | 16032     |
| rollout/          |           |
|    return_max     | 90.9      |
|    return_mean    | 56.1      |
|    return_min     | 11.3      |
|    return_std     | 28.5      |
---------------------------------


502batch [00:05, 41.35batch/s]
516batch [00:05, 52.28batch/s]
530batch [00:05, 63.82batch/s]
560batch [00:05, 90.16batch/s]
574batch [00:05, 99.78batch/s]
588batch [00:06, 108.50batch/s]
602batch [00:06, 115.61batch/s]
616batch [00:06, 121.82batch/s]
644batch [00:06, 126.79batch/s]
659batch [00:06, 131.23batch/s]
674batch [00:06, 135.72batch/s]
688batch [00:06, 136.32batch/s]
702batch [00:06, 136.86batch/s]
717batch [00:06, 139.06batch/s]
748batch [00:07, 143.97batch/s]
763batch [00:07, 141.76batch/s]
778batch [00:07, 135.76batch/s]
792batch [00:07, 132.83batch/s]
806batch [00:07, 132.91batch/s]
820batch [00:07, 132.88batch/s]
849batch [00:07, 137.82batch/s]
863batch [00:08, 136.09batch/s]
877batch [00:08, 132.81batch/s]
891batch [00:08, 128.05batch/s]
917batch [00:08, 128.02batch/s]
931batch [00:08, 128.92batch/s]
944batch [00:08, 117.06batch/s]
968batch [00:08, 104.84batch/s]
979batch [00:09, 101.08batch/s]
1000batch [00:09, 94.15batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -4.56e-05 |
|    entropy        | 0.0456    |
|    epoch          | 58        |
|    l2_loss        | 0         |
|    l2_norm        | 251       |
|    loss           | 0.00829   |
|    neglogp        | 0.00833   |
|    prob_true_act  | 0.992     |
|    samples_so_far | 32032     |
| rollout/          |           |
|    return_max     | 95        |
|    return_mean    | 81.9      |
|    return_min     | 58.7      |
|    return_std     | 15.5      |
---------------------------------



1010batch [00:10, 28.06batch/s]
1032batch [00:10, 44.44batch/s]
1053batch [00:10, 61.78batch/s]
1063batch [00:10, 69.02batch/s]
1084batch [00:11, 82.32batch/s]
1104batch [00:11, 89.04batch/s]
1114batch [00:11, 89.88batch/s]
1134batch [00:11, 85.97batch/s]
1152batch [00:11, 82.66batch/s]
1170batch [00:12, 82.55batch/s]
1190batch [00:12, 89.90batch/s]
1200batch [00:12, 88.93batch/s]
1221batch [00:12, 93.39batch/s]
1231batch [00:12, 93.02batch/s]
1252batch [00:12, 94.62batch/s]
1272batch [00:13, 89.73batch/s]
1292batch [00:13, 87.89batch/s]
1301batch [00:13, 85.86batch/s]
1320batch [00:13, 87.12batch/s]
1339batch [00:13, 86.88batch/s]
1358batch [00:14, 88.63batch/s]
1368batch [00:14, 91.76batch/s]
1382batch [00:14, 104.93batch/s]
1396batch [00:14, 114.29batch/s]
1427batch [00:14, 129.06batch/s]
1441batch [00:14, 132.17batch/s]
1455batch [00:14, 134.15batch/s]
1469batch [00:15, 133.10batch/s]
1483batch [00:15, 134.22batch/s]
1497batch [00:15, 134.06batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -2.61e-05 |
|    entropy        | 0.0261    |
|    epoch          | 88        |
|    l2_loss        | 0         |
|    l2_norm        | 269       |
|    loss           | 0.00392   |
|    neglogp        | 0.00394   |
|    prob_true_act  | 0.996     |
|    samples_so_far | 48032     |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

1511batch [00:15, 55.51batch/s] 
1526batch [00:15, 68.99batch/s]
1541batch [00:16, 81.95batch/s]
1555batch [00:16, 92.52batch/s]
1569batch [00:16, 102.55batch/s]
1597batch [00:16, 113.30batch/s]
1610batch [00:16, 113.98batch/s]
1623batch [00:16, 115.37batch/s]
1636batch [00:16, 118.86batch/s]
1651batch [00:16, 125.64batch/s]
1683batch [00:17, 136.46batch/s]
1697batch [00:17, 132.34batch/s]
1711batch [00:17, 131.50batch/s]
1725batch [00:17, 132.91batch/s]
1739batch [00:17, 133.69batch/s]
1753batch [00:17, 134.67batch/s]
1783batch [00:17, 135.82batch/s]
1797batch [00:17, 136.39batch/s]
1812batch [00:18, 137.99batch/s]
1827batch [00:18, 139.40batch/s]
1841batch [00:18, 137.66batch/s]
1855batch [00:18, 135.76batch/s]
1871batch [00:18, 137.09batch/s]
1903batch [00:18, 139.84batch/s]
1917batch [00:18, 136.03batch/s]
1931batch [00:18, 136.50batch/s]
1945batch [00:19, 136.62batch/s]
1959batch [00:19, 133.91batch/s]
1989batch [00:19, 138.79batch/s]
Epoch 116 of 200                [A

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 2000      |
|    ent_loss       | -1.19e-05 |
|    entropy        | 0.0119    |
|    epoch          | 117       |
|    l2_loss        | 0         |
|    l2_norm        | 280       |
|    loss           | 0.00156   |
|    neglogp        | 0.00157   |
|    prob_true_act  | 0.998     |
|    samples_so_far | 64032     |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

2003batch [00:19, 58.98batch/s]
2017batch [00:20, 70.46batch/s]
2030batch [00:20, 80.77batch/s]
2044batch [00:20, 92.25batch/s]
2071batch [00:20, 108.39batch/s]
2086batch [00:20, 117.88batch/s]
2101batch [00:20, 125.65batch/s]
2115batch [00:20, 128.45batch/s]
2129batch [00:20, 130.84batch/s]
2159batch [00:21, 136.94batch/s]
2174batch [00:21, 128.37batch/s]
2188batch [00:21, 129.88batch/s]
2202batch [00:21, 130.82batch/s]
2217batch [00:21, 135.52batch/s]
2231batch [00:21, 135.21batch/s]
2261batch [00:21, 141.94batch/s]
2276batch [00:21, 139.07batch/s]
2290batch [00:22, 138.84batch/s]
2304batch [00:22, 138.19batch/s]
2319batch [00:22, 139.61batch/s]
2333batch [00:22, 139.66batch/s]
2347batch [00:22, 137.70batch/s]
2380batch [00:22, 145.94batch/s]
2395batch [00:22, 144.99batch/s]
2410batch [00:22, 142.79batch/s]
2425batch [00:22, 139.41batch/s]
2439batch [00:23, 137.79batch/s]
2454batch [00:23, 139.16batch/s]
2482batch [00:23, 137.34batch/s]
2496batch [00:23, 134.49batch/s]
Epoch 146 of 2

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 2500      |
|    ent_loss       | -8.37e-06 |
|    entropy        | 0.00837   |
|    epoch          | 147       |
|    l2_loss        | 0         |
|    l2_norm        | 289       |
|    loss           | 0.00104   |
|    neglogp        | 0.00105   |
|    prob_true_act  | 0.999     |
|    samples_so_far | 80032     |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

2510batch [00:24, 57.01batch/s]
2524batch [00:24, 69.11batch/s]
2548batch [00:24, 81.24batch/s]
2559batch [00:24, 82.84batch/s]
2580batch [00:24, 87.39batch/s]
2600batch [00:25, 86.73batch/s]
2610batch [00:25, 84.16batch/s]
2630batch [00:25, 89.51batch/s]
2651batch [00:25, 94.31batch/s]
2661batch [00:25, 94.98batch/s]
2682batch [00:25, 96.24batch/s]
2702batch [00:26, 93.91batch/s]
2712batch [00:26, 94.31batch/s]
2732batch [00:26, 91.81batch/s]
2752batch [00:26, 91.22batch/s]
2762batch [00:26, 88.53batch/s]
2783batch [00:26, 92.46batch/s]
2803batch [00:27, 90.36batch/s]
2813batch [00:27, 84.98batch/s]
2833batch [00:27, 89.85batch/s]
2854batch [00:27, 93.24batch/s]
2864batch [00:27, 92.24batch/s]
2884batch [00:28, 90.32batch/s]
2903batch [00:28, 86.64batch/s]
2922batch [00:28, 87.03batch/s]
2941batch [00:28, 88.55batch/s]
2950batch [00:28, 87.16batch/s]
2968batch [00:29, 86.60batch/s]
2987batch [00:29, 88.38batch/s]
2996batch [00:29, 85.15batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 3000      |
|    ent_loss       | -9.41e-06 |
|    entropy        | 0.00941   |
|    epoch          | 176       |
|    l2_loss        | 0         |
|    l2_norm        | 296       |
|    loss           | 0.00116   |
|    neglogp        | 0.00117   |
|    prob_true_act  | 0.999     |
|    samples_so_far | 96032     |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

3005batch [00:30, 32.46batch/s]
3017batch [00:30, 43.86batch/s]
3031batch [00:30, 58.59batch/s]
3045batch [00:30, 72.77batch/s]
3061batch [00:30, 88.04batch/s]
3094batch [00:30, 116.07batch/s]
3109batch [00:30, 122.98batch/s]
3123batch [00:30, 124.49batch/s]
3137batch [00:31, 128.41batch/s]
3151batch [00:31, 127.01batch/s]
3165batch [00:31, 125.80batch/s]
3195batch [00:31, 132.04batch/s]
3209batch [00:31, 132.76batch/s]
3223batch [00:31, 133.14batch/s]
3238batch [00:31, 136.02batch/s]
3253batch [00:31, 136.74batch/s]
3267batch [00:32, 136.10batch/s]
3296batch [00:32, 134.69batch/s]
3310batch [00:32, 132.85batch/s]
3324batch [00:32, 134.38batch/s]
3338batch [00:32, 131.38batch/s]
3352batch [00:32, 129.96batch/s]
3383batch [00:32, 137.53batch/s]
3397batch [00:33, 136.94batch/s]
3400batch [00:33, 102.87batch/s]


Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

0batch [00:00, ?batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 0         |
|    ent_loss       | -7.27e-06 |
|    entropy        | 0.00727   |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 301       |
|    loss           | 0.00087   |
|    neglogp        | 0.000877  |
|    prob_true_act  | 0.999     |
|    samples_so_far | 32        |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

32batch [00:00, 65.75batch/s]
63batch [00:00, 105.56batch/s]
93batch [00:01, 126.13batch/s]
138batch [00:01, 130.64batch/s]
168batch [00:01, 137.51batch/s]
200batch [00:01, 145.23batch/s]
231batch [00:02, 146.99batch/s]
276batch [00:02, 140.44batch/s]
307batch [00:02, 141.78batch/s]
338batch [00:02, 143.38batch/s]
383batch [00:03, 137.57batch/s]
411batch [00:03, 136.46batch/s]
440batch [00:03, 140.57batch/s]
490batch [00:03, 152.57batch/s]
Epoch 13 of 200                [A

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -3.42e-06 |
|    entropy        | 0.00342   |
|    epoch          | 14        |
|    l2_loss        | 0         |
|    l2_norm        | 307       |
|    loss           | 0.000366  |
|    neglogp        | 0.00037   |
|    prob_true_act  | 1         |
|    samples_so_far | 16032     |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

523batch [00:04, 78.78batch/s]
554batch [00:04, 103.30batch/s]
585batch [00:05, 122.03batch/s]
630batch [00:05, 130.79batch/s]
659batch [00:05, 133.38batch/s]
689batch [00:05, 139.47batch/s]
732batch [00:06, 123.06batch/s]
768batch [00:06, 101.44batch/s]
801batch [00:06, 100.49batch/s]
832batch [00:07, 90.30batch/s]
872batch [00:07, 92.47batch/s]
902batch [00:07, 93.97batch/s]
943batch [00:08, 90.64batch/s]
975batch [00:08, 96.66batch/s]
995batch [00:08, 96.15batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -3.19e-06 |
|    entropy        | 0.00319   |
|    epoch          | 28        |
|    l2_loss        | 0         |
|    l2_norm        | 312       |
|    loss           | 0.000346  |
|    neglogp        | 0.000349  |
|    prob_true_act  | 1         |
|    samples_so_far | 32032     |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

1005batch [00:09, 34.08batch/s]
1046batch [00:10, 67.36batch/s]
1076batch [00:10, 82.11batch/s]
1113batch [00:10, 77.99batch/s]
1149batch [00:11, 84.64batch/s]
1190batch [00:11, 96.55batch/s]
1220batch [00:11, 121.88batch/s]
1249batch [00:12, 131.16batch/s]
1295batch [00:12, 144.24batch/s]
1325batch [00:12, 141.29batch/s]
1356batch [00:12, 142.58batch/s]
1400batch [00:13, 135.31batch/s]
1430batch [00:13, 141.75batch/s]
1460batch [00:13, 142.67batch/s]
1491batch [00:13, 141.64batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -2.22e-06 |
|    entropy        | 0.00222   |
|    epoch          | 42        |
|    l2_loss        | 0         |
|    l2_norm        | 317       |
|    loss           | 0.000229  |
|    neglogp        | 0.000231  |
|    prob_true_act  | 1         |
|    samples_so_far | 48032     |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 


1538batch [00:14, 87.43batch/s]
1568batch [00:14, 107.69batch/s]
1598batch [00:15, 124.49batch/s]
1644batch [00:15, 138.83batch/s]
1675batch [00:15, 145.02batch/s]
1705batch [00:15, 144.74batch/s]
1735batch [00:16, 145.03batch/s]
1782batch [00:16, 146.11batch/s]
1813batch [00:16, 147.89batch/s]
1843batch [00:16, 143.32batch/s]
1889batch [00:17, 145.08batch/s]
1919batch [00:17, 141.10batch/s]
1950batch [00:17, 144.46batch/s]
1981batch [00:17, 145.26batch/s]
1996batch [00:17, 136.65batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 2000      |
|    ent_loss       | -1.52e-06 |
|    entropy        | 0.00152   |
|    epoch          | 57        |
|    l2_loss        | 0         |
|    l2_norm        | 321       |
|    loss           | 0.000145  |
|    neglogp        | 0.000147  |
|    prob_true_act  | 1         |
|    samples_so_far | 64032     |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

2026batch [00:18, 74.42batch/s]
2056batch [00:18, 100.07batch/s]
2086batch [00:19, 117.54batch/s]
2132batch [00:19, 135.23batch/s]
2163batch [00:19, 140.89batch/s]
2195batch [00:19, 144.27batch/s]
2225batch [00:20, 142.08batch/s]
2272batch [00:20, 145.14batch/s]
2302batch [00:20, 141.68batch/s]
2333batch [00:20, 143.11batch/s]
2379batch [00:21, 141.89batch/s]
2409batch [00:21, 138.67batch/s]
2438batch [00:21, 140.05batch/s]
2484batch [00:21, 128.64batch/s]
2498batch [00:22, 110.80batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 2500      |
|    ent_loss       | -1.52e-06 |
|    entropy        | 0.00152   |
|    epoch          | 71        |
|    l2_loss        | 0         |
|    l2_norm        | 326       |
|    loss           | 0.000149  |
|    neglogp        | 0.000151  |
|    prob_true_act  | 1         |
|    samples_so_far | 80032     |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

2520batch [00:22, 48.48batch/s]
2548batch [00:23, 64.69batch/s]
2587batch [00:23, 84.59batch/s]
2618batch [00:24, 92.03batch/s]
2659batch [00:24, 92.61batch/s]
2689batch [00:24, 92.18batch/s]
2727batch [00:25, 85.12batch/s]
2757batch [00:25, 90.36batch/s]
2797batch [00:26, 89.22batch/s]
2834batch [00:26, 86.81batch/s]
2862batch [00:26, 87.19batch/s]
2900batch [00:27, 88.55batch/s]
2928batch [00:27, 114.04batch/s]
2973batch [00:27, 135.32batch/s]
2987batch [00:27, 135.43batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 3000      |
|    ent_loss       | -7.99e-07 |
|    entropy        | 0.000799  |
|    epoch          | 85        |
|    l2_loss        | 0         |
|    l2_norm        | 330       |
|    loss           | 7.42e-05  |
|    neglogp        | 7.5e-05   |
|    prob_true_act  | 1         |
|    samples_so_far | 96032     |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

3001batch [00:28, 57.79batch/s] 
3031batch [00:28, 84.64batch/s]
3077batch [00:29, 118.24batch/s]
3107batch [00:29, 131.92batch/s]
3137batch [00:29, 135.20batch/s]
3184batch [00:29, 146.41batch/s]
3214batch [00:29, 145.07batch/s]
3244batch [00:30, 145.05batch/s]
3289batch [00:30, 140.79batch/s]
3319batch [00:30, 141.72batch/s]
3350batch [00:30, 143.83batch/s]
3380batch [00:31, 143.01batch/s]
3426batch [00:31, 141.84batch/s]
3457batch [00:31, 144.24batch/s]
3488batch [00:31, 145.88batch/s]
Epoch 99 of 200                 [A

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 3500      |
|    ent_loss       | -5.48e-07 |
|    entropy        | 0.000548  |
|    epoch          | 100       |
|    l2_loss        | 0         |
|    l2_norm        | 334       |
|    loss           | 4.69e-05  |
|    neglogp        | 4.74e-05  |
|    prob_true_act  | 1         |
|    samples_so_far | 112032    |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

3534batch [00:32, 85.53batch/s]
3564batch [00:32, 108.37batch/s]
3594batch [00:33, 123.21batch/s]
3638batch [00:33, 131.40batch/s]
3668batch [00:33, 134.79batch/s]
3698batch [00:33, 139.13batch/s]
3745batch [00:34, 146.25batch/s]
3775batch [00:34, 143.09batch/s]
3806batch [00:34, 142.25batch/s]
3837batch [00:34, 142.87batch/s]
3883batch [00:35, 147.19batch/s]
3913batch [00:35, 145.00batch/s]
3943batch [00:35, 144.07batch/s]
3990batch [00:35, 148.39batch/s]
Epoch 113 of 200                [A

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 4000     |
|    ent_loss       | -3.4e-07 |
|    entropy        | 0.00034  |
|    epoch          | 114      |
|    l2_loss        | 0        |
|    l2_norm        | 338      |
|    loss           | 2.77e-05 |
|    neglogp        | 2.8e-05  |
|    prob_true_act  | 1        |
|    samples_so_far | 128032   |
| rollout/          |          |
|    return_max     | 94       |
|    return_mean    | 94       |
|    return_min     | 94       |

4019batch [00:36, 72.20batch/s]
4049batch [00:36, 97.09batch/s]
4093batch [00:37, 116.42batch/s]
4121batch [00:37, 110.09batch/s]
4156batch [00:37, 101.78batch/s]
4200batch [00:38, 100.11batch/s]
4231batch [00:38, 94.21batch/s]
4262batch [00:38, 94.91batch/s]
4302batch [00:39, 95.27batch/s]
4332batch [00:39, 91.74batch/s]
4373batch [00:40, 96.22batch/s]
4403batch [00:40, 93.11batch/s]
4444batch [00:40, 95.53batch/s]
4474batch [00:41, 94.28batch/s]
4494batch [00:41, 92.23batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 4500      |
|    ent_loss       | -5.91e-07 |
|    entropy        | 0.000591  |
|    epoch          | 128       |
|    l2_loss        | 0         |
|    l2_norm        | 342       |
|    loss           | 5.41e-05  |
|    neglogp        | 5.47e-05  |
|    prob_true_act  | 1         |
|    samples_so_far | 144032    |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

4514batch [00:42, 39.73batch/s]
4543batch [00:42, 63.39batch/s]
4573batch [00:42, 84.36batch/s]
4614batch [00:43, 115.04batch/s]
4642batch [00:43, 123.83batch/s]
4689batch [00:43, 141.88batch/s]
4720batch [00:43, 141.97batch/s]
4752batch [00:44, 146.94batch/s]
4782batch [00:44, 140.93batch/s]
4829batch [00:44, 148.61batch/s]
4860batch [00:44, 143.08batch/s]
4890batch [00:45, 141.03batch/s]
4935batch [00:45, 141.73batch/s]
4966batch [00:45, 140.89batch/s]
4995batch [00:45, 134.45batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 5000      |
|    ent_loss       | -3.04e-07 |
|    entropy        | 0.000304  |
|    epoch          | 142       |
|    l2_loss        | 0         |
|    l2_norm        | 346       |
|    loss           | 2.44e-05  |
|    neglogp        | 2.47e-05  |
|    prob_true_act  | 1         |
|    samples_so_far | 160032    |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 


5025batch [00:46, 72.49batch/s]
5072batch [00:46, 107.84batch/s]
5103batch [00:47, 126.16batch/s]
5132batch [00:47, 129.00batch/s]
5176batch [00:47, 137.82batch/s]
5205batch [00:47, 136.99batch/s]
5246batch [00:48, 125.26batch/s]
5275batch [00:48, 134.63batch/s]
5320batch [00:48, 144.43batch/s]
5349batch [00:49, 138.59batch/s]
5386batch [00:49, 69.32batch/s]
5425batch [00:50, 100.41batch/s]
5455batch [00:50, 122.01batch/s]
5485batch [00:50, 132.65batch/s]
5499batch [00:50, 131.10batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 5500      |
|    ent_loss       | -2.08e-07 |
|    entropy        | 0.000208  |
|    epoch          | 157       |
|    l2_loss        | 0         |
|    l2_norm        | 350       |
|    loss           | 1.61e-05  |
|    neglogp        | 1.63e-05  |
|    prob_true_act  | 1         |
|    samples_so_far | 176032    |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

5529batch [00:51, 71.51batch/s]
5559batch [00:51, 96.40batch/s]
5590batch [00:51, 118.89batch/s]
5635batch [00:51, 132.61batch/s]
5666batch [00:52, 137.51batch/s]
5696batch [00:52, 139.48batch/s]
5726batch [00:52, 142.45batch/s]
5771batch [00:52, 133.51batch/s]
5808batch [00:53, 100.18batch/s]
5841batch [00:53, 100.78batch/s]
5873batch [00:54, 95.84batch/s]
5914batch [00:54, 95.40batch/s]
5944batch [00:54, 97.20batch/s]
5985batch [00:55, 95.65batch/s]
5995batch [00:55, 91.06batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 6000     |
|    ent_loss       | -2e-07   |
|    entropy        | 0.0002   |
|    epoch          | 171      |
|    l2_loss        | 0        |
|    l2_norm        | 354      |
|    loss           | 1.63e-05 |
|    neglogp        | 1.65e-05 |
|    prob_true_act  | 1        |
|    samples_so_far | 192032   |
| rollout/          |          |
|    return_max     | 94       |
|    return_mean    | 94       |
|    return_min     | 94       |

6016batch [00:56, 42.95batch/s]
6045batch [00:56, 66.38batch/s]
6086batch [00:56, 86.10batch/s]
6123batch [00:57, 82.25batch/s]
6152batch [00:57, 86.59batch/s]
6191batch [00:58, 90.70batch/s]
6230batch [00:58, 90.11batch/s]
6259batch [00:58, 112.49batch/s]
6289batch [00:59, 128.75batch/s]
6330batch [00:59, 127.10batch/s]
6360batch [00:59, 135.02batch/s]
6391batch [00:59, 143.16batch/s]
6439batch [01:00, 151.84batch/s]
6471batch [01:00, 150.71batch/s]
6487batch [01:00, 141.46batch/s]

Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 6500      |
|    ent_loss       | -1.61e-07 |
|    entropy        | 0.000161  |
|    epoch          | 185       |
|    l2_loss        | 0         |
|    l2_norm        | 357       |
|    loss           | 1.28e-05  |
|    neglogp        | 1.3e-05   |
|    prob_true_act  | 1         |
|    samples_so_far | 208032    |
| rollout/          |           |
|    return_max     | 94        |
|    return_mean    | 94        |
|    return_min 

6502batch [01:01, 63.13batch/s] 
6531batch [01:01, 87.93batch/s]
6575batch [01:01, 116.37batch/s]
6605batch [01:01, 129.97batch/s]
6650batch [01:02, 138.88batch/s]
6680batch [01:02, 139.93batch/s]
6710batch [01:02, 139.10batch/s]
6740batch [01:02, 140.74batch/s]
6787batch [01:03, 145.91batch/s]
6818batch [01:03, 144.53batch/s]
6849batch [01:03, 145.15batch/s]
6880batch [01:03, 143.11batch/s]
6929batch [01:04, 149.08batch/s]
6960batch [01:04, 148.14batch/s]
6990batch [01:04, 145.30batch/s]
7000batch [01:04, 108.50batch/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
93.99000000000001, 0.8273606030480566
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
93.99000000000001, 0.8273606030480566
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
93.99000000000001, 0.8273606030480566
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
93.99000000000001, 0.8273606030480566
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
93.99000000000001, 0.8273606030480566
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
93.99000000000001, 0.8273606030480566
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Destroyed
Exit Reached
93.99000000000001, 0.8273606030480566
Tank Destroyed
Tank Destroyed
Tank Destroyed
Tank Des