*************** This is the best working model so far ***********

### Muti-agent reinforcement learning for Precision Agriculture

In [3]:
# Cell 1
# Import libraries
import gymnasium as gym
import random
import numpy as np
import pygame
from gymnasium import spaces
from shapely import Polygon
from shapely.geometry import Point
import yaml
np.random.seed(33) # seeding

In [18]:
# Cell 2 
# Read the experiments file and select the experiment
experiment_set_path = r'experiments/set1.yaml'
with open(experiment_set_path, 'r') as experiment_file:
    config = yaml.load(experiment_file, Loader=yaml.FullLoader)
    config['field'] = list(map(lambda x: tuple(x), config['field']))
    config['init_positions'] = list(map(lambda x: np.array(x), config['init_positions']))
    config['infected_locations'] = list(map(lambda x: tuple(x), config['infected_locations']))
selected_experiment = config # Select the experiment set
selected_experiment

{'field': [(43, 24), (20, 47), (13, 34), (12, 9), (22, 0)],
 'init_positions': [array([14, 24]), array([40, 25]), array([20, 40])],
 'infected_locations': [(15, 10),
  (22, 13),
  (25, 20),
  (26, 21),
  (35, 25),
  (25, 35)]}

Some helpful functions:

In [9]:
# Cell 3
# function for random starting points
# unused
def random_starting_locations(points, infected_locations):
    points = set([tuple(p) for p in points])
    st_locs = points.difference(infected_locations) # for points - infected_locations
    st_locs = random.choices(np.array(list(st_locs)),k=3)
    return st_locs

# convert binary list to decimal
def binary_list_to_decimal(bin_list):
    bin = ''
    for b in bin_list:
        bin += str(b)
    dec = int(bin,2)
    return dec

Make the multi-agent class:

In [10]:
# Cell 4
class ThreeAgentGridworldEnv(gym.Env):
    metadata = {'render_modes': ['human', 'print', 'rgb_array'], "render_fps": 4}    
    def __init__(self, render_mode=None, grid_size=(50, 50), poly_vertices=selected_experiment['field']):
        super(ThreeAgentGridworldEnv, self).__init__()
        self.Poly = Polygon(poly_vertices) # Get the points of the polygon
        self.poly_vertices = poly_vertices # Vertices of polygon
        self.size = grid_size[0]  # The size of the square grid
        self.window_size = 800  # The size of the PyGame window        
        self.grid_size = grid_size # Size of the grid (May need to remove later on)
        self.outer_boundary = self.Poly.buffer(distance=2) # outer boundary with buffer of distance 2

        # Observation points
        self.observation_points = self.obs_points()
        # print('Observation points:', self.observation_points)
        self.observation_length = len(self.observation_points)
        self.observation_map = {tuple(v):i for i,v in enumerate(self.observation_points)}
        # for p in self.observation_points:
        #     print('observation map:', p, self.observation_map[tuple(p)])

        # Keep track of visited states and count steps
        self.step_count = 0
        self.visited = set()
        self.infected_locations = selected_experiment['infected_locations']
        
        self.infected_length = len(self.infected_locations)
        self.infected_state_length = 2**(len(self.infected_locations)) # 2**5, binary to decimal
        self.infected_dict = {v:0 for v in self.infected_locations} # dictionary of locations
        
        # Action and observation space
        self.action_space = spaces.MultiDiscrete([5, 5, 5])  # 4 possible actions for each of the two agents
        # self.observation_positions = spaces.MultiDiscrete([self.observation_length, self.observation_length, self.observation_length])
        # self.infected_space = spaces.MultiBinary(5)
        # self.observation_space = spaces.Tuple((self.observation_positions, self.infected_space))
        self.observation_space = spaces.MultiDiscrete([self.observation_length, self.observation_length, self.observation_length, self.infected_state_length])
        
        assert render_mode is None or render_mode in self.metadata["render_modes"] # Check if the render mode is correct
        self.render_mode = render_mode
        # If human-rendering is used, `self.window` will be a reference
        # to the window that we draw to. `self.clock` will be a clock that is used
        # to ensure that the environment is rendered at the correct framerate in
        # human-mode. They will remain `None` until human-mode is used for the
        # first time.
        self.window = None
        self.clock = None

        # Reset the environment and start
        self.reset()

    def obs_points(self):
        xp,yp = self.Poly.exterior.xy
        # Gridpoints
        xs = np.arange(0, 90, 1)
        ys = np.arange(0, 90, 1)
        # Inside points
        # print(xs,ys)
        xps, yps = [], []
        for xi in xs:
            for yi in ys:
                p = Point(xi,yi)
                if self.Poly.contains(p):
                    xps += [p.x]
                    yps += [p.y]
        xps += xp
        yps += yp
        # plt.plot(xp,yp)
        # plt.scatter(xps,yps, color='r')
        obs_points = np.array(list(set(zip(xps,yps)))) # Taking unique observation points
        return obs_points
    
    def _get_obs(self):
        a1, a2, a3 = self.agent_positions[0], self.agent_positions[1], self.agent_positions[2]
        info = {'agent1': a1, 'agent2': a2, 'agent3': a3, 'step_count': self.step_count}
        # print('agent positions:', a1, a2, a3)
        p1,p2,p3 = self.observation_map[tuple(a1)], self.observation_map[tuple(a2)], self.observation_map[tuple(a3)]
        infected = binary_list_to_decimal(list(self.infected_dict.values()))
        state = np.array([p1,p2,p3,infected]) # convert the infected binary list to decimal
        return state, info

    def reset(self, seed=None, options={}):
        self.visited = set()
        self.step_count = 0
        self.infected_locations = selected_experiment['infected_locations']
        self.infected_dict = {v:0 for v in self.infected_locations} # dictionary of locations
        self.agent_positions = selected_experiment['init_positions']
        # self.agent_positions = [
        #     np.array([14, 34]),  # Agent 1 
        #     # np.array([self.grid_size[0]-1, self.grid_size[1]-1]),  # Agent 2 starts at bottom-right corner,
        #     # np.array([self.grid_size[0]-3, self.grid_size[1]-3])  # Agent 3 starts at specific position,
        #     np.array([80/2, 50/2]),  # Agent 2 
        #     np.array([40/2, 80/2]),  # Agent 3
        # ]
        # self.agent_positions = random_starting_locations(self.observation_points, self.infected_locations)
        return self._get_obs()

    def step(self, action):
        # Placeholder for terminal state and rewards
        terminated, truncated = False, False
        rewards = 0
        self.step_count += 1

        # Define the movements corresponding to each action
        movements = [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)]  # up, down, left, right, none
        
        # Update the positions of both agents
        for i, act in enumerate(action):
            # if act == 4: # If the action is none
            #     terminated = True # Terminated
            #     break

            movement = movements[act] # What movement to take
            new_position = self.agent_positions[i] + movement # New position after movement
            
            # Ensure the new position is within bounds
            # new_position = np.clip(new_position, [0, 0], [self.grid_size[0]-1, self.grid_size[1]-1])
            new_p = Point(new_position[0], new_position[1])
            # print("checking new position", tuple(new_position))
            if self.Poly.contains(new_p):
                self.agent_positions[i] = new_position
            else:
                rewards -= 10
            if tuple(new_position) in self.visited:
                rewards -= 10
            else:
                rewards -= 1
            self.visited.add(tuple(new_position))
        
        # Check if an infected location is visited
        # JW: Should we make a dedicated action for removing the weed instead of doing it automatically?
        # JW: Perhaps adding a cost of removing the weed since real drones will have limited herbicide and should be discouraged from wasting it
        infected_visited = [x for x in self.agent_positions if tuple(x) in self.infected_locations] # If infected cells are visited
        for v in infected_visited:
            if tuple(v) in self.infected_locations:
                self.infected_locations.remove(tuple(v))
                self.infected_dict[tuple(v)] = 1
        
        if infected_visited:
            rewards += 100 * len(infected_visited) 

        if sum(list(self.infected_dict.values()))==self.infected_length:
            rewards += 100000
            terminated = True

        
        # If the agents meet at the same position, we can assign a reward or consider it a terminal state
        if np.array_equal(self.agent_positions[0], self.agent_positions[1]) or np.array_equal(self.agent_positions[0], self.agent_positions[2]) or np.array_equal(self.agent_positions[1], self.agent_positions[2]):
            rewards -= 100000  # Infinity reward for meeting at the same position
            terminated = True
        
        obs, info = self._get_obs()
        # rewards = rewards * self.gamma ** self.step_count
        return obs, rewards, terminated, truncated, info

    def render(self):
        if self.render_mode == 'print':
            grid = np.zeros(self.grid_size)
            grid[tuple(self.agent_positions[0])] = 1  # Mark the position of the first agent
            grid[tuple(self.agent_positions[1])] = 2  # Mark the position of the second agent
            print(grid)
        else:
            if self.window is None and self.render_mode == "human": # Initialize pygame if it is not initialized
                pygame.init()
                pygame.display.init()
                self.window = pygame.display.set_mode(
                    (self.window_size, self.window_size)
                )
            if self.clock is None and self.render_mode == "human":
                self.clock = pygame.time.Clock()
            
            # Fill the canvas
            canvas = pygame.Surface((self.window_size, self.window_size))
            canvas.fill((255, 255, 255))
            pix_square_size = (
                self.window_size / self.size
            )  # The size of a single grid square in pixels

            # Draw the polygon
            pixel_poly_vertices = [(point[0] * pix_square_size, point[1] * pix_square_size) for point in self.poly_vertices]
            pygame.draw.polygon(surface=canvas, 
                                color=(255, 255, 0), 
                                points=pixel_poly_vertices)
            
            # Draw the visited regions
            for p in self.visited:
                pygame.draw.rect(
                canvas,
                pygame.Color(100, 100, 100, a=0.5),
                pygame.Rect(
                    pix_square_size * np.array(p),
                    (pix_square_size, pix_square_size),
                ),
                )
            # Draw agent1 (square)
            pygame.draw.rect(
                canvas,
                (255, 0, 0),
                pygame.Rect(
                    pix_square_size * self.agent_positions[0],
                    (pix_square_size, pix_square_size),
                ),
            )
            # Draw agent2 (circle)
            pygame.draw.circle(
                canvas,
                (0, 0, 255),
                (self.agent_positions[1] + 0.5) * pix_square_size,
                pix_square_size / 3,
            )
            # Draw agent3 (circle)
            pygame.draw.circle(
                canvas,
                (0, 255, 0),
                (self.agent_positions[2] + 0.5) * pix_square_size,
                pix_square_size / 3,
            )
            # Draw infected locations
            for l in self.infected_locations:
                pygame.draw.rect(
                    canvas,
                    (0, 255, 255),
                    pygame.Rect(
                        pix_square_size * np.array(l),
                        (pix_square_size, pix_square_size),
                    ),
                )


            if self.render_mode == "human":
                # The following line copies our drawings from `canvas` to the visible window
                self.window.blit(canvas, canvas.get_rect())
                pygame.event.pump()
                pygame.display.update()

                # We need to ensure that human-rendering occurs at the predefined framerate.
                # The following line will automatically add a delay to keep the framerate stable.
                self.clock.tick(self.metadata["render_fps"])
                # Finally
                pygame.event.get()

            elif self.render_mode == 'rgb_array':  # rgb_array
                return np.transpose(
                    np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
                )
            
    def close(self):
        if self.window is not None:
            pygame.display.quit()
            pygame.quit()

# Register the environment
gym.envs.registration.register(
    id='ThreeAgentGridworld-v0',
    entry_point=ThreeAgentGridworldEnv,
    max_episode_steps=2000,
)

Create the environment and random play (Optional):

In [11]:
# Cell 5
# Example of creating and using the environment
env = gym.make('ThreeAgentGridworld-v0', render_mode='human')
env.metadata['render_fps'] = 30
obs, info = env.reset()
env.render()

# Random play
env.reset()
terminated, truncated = False, False
total_rewards = 0
while not (terminated or truncated):
    action = env.action_space.sample()
    print(action)
    obs, reward, terminated, truncated,  info = env.step(action)
    env.render()
    total_rewards += reward
    print(f"Obs: {obs}, Reward: {reward}, terminated: {terminated}, total_rewards: {total_rewards}")
    pygame.event.get()
print('terminated:', terminated, 'truncated:', truncated)

[4 2 1]
Obs: [ 26 495 105   0], Reward: -3, terminated: False, total_rewards: -3
[0 4 0]
Obs: [602 495 627   0], Reward: -12, terminated: False, total_rewards: -15
[2 2 2]
Obs: [675 555 134   0], Reward: -3, terminated: False, total_rewards: -18
[1 2 4]
Obs: [ 89  71 134   0], Reward: -12, terminated: False, total_rewards: -30
[3 1 4]
Obs: [ 26 405 134   0], Reward: -21, terminated: False, total_rewards: -51
[4 4 3]
Obs: [ 26 405 627   0], Reward: -30, terminated: False, total_rewards: -81
[3 4 0]
Obs: [513 405 296   0], Reward: -12, terminated: False, total_rewards: -93
[4 3 3]
Obs: [513  77 225   0], Reward: -12, terminated: False, total_rewards: -105
[1 2 2]
Obs: [ 35 405 296   0], Reward: -21, terminated: False, total_rewards: -126
[1 3 3]
Obs: [572  77 225   0], Reward: -21, terminated: False, total_rewards: -147
[4 1 3]
Obs: [572 364 729   0], Reward: -12, terminated: False, total_rewards: -159
[3 3 0]
Obs: [249 293 397   0], Reward: -3, terminated: False, total_rewards: -162
[2 

In [13]:
# Cell 6
# Close the environment
env.close()
# Set a breakpoint, will give an error
assert False, "breakpoint"

# Training cells!

Advantage actor-critic:

In [None]:
# Cell 7
# # keep track of running time
# import time
# start = time.time()

# # Advantage Actor Critic
# from stable_baselines3 import A2C
# from stable_baselines3.common.env_util import make_vec_env

# vec_env = make_vec_env('ThreeAgentGridworld-v0', n_envs=4)

# model = A2C("MlpPolicy", vec_env, verbose=1, tensorboard_log="./a2c_log_w2", gamma=0.99)
# model.learn(total_timesteps=1000000)
# model.save("multi_agent_a2c_w2")
# del model, A2C, make_vec_env
# end = time.time()
# duration = end-start
# print("Time taken for training:", duration)

Using cuda device
Logging to ./a2c_log_w2\A2C_1
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 201       |
|    ep_rew_mean        | -1.05e+05 |
| time/                 |           |
|    fps                | 648       |
|    iterations         | 100       |
|    time_elapsed       | 3         |
|    total_timesteps    | 2000      |
| train/                |           |
|    entropy_loss       | -4.82     |
|    explained_variance | -0.000163 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -249      |
|    value_loss         | 6.13e+03  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 302       |
|    ep_rew_mean        | -1.07e+05 |
| time/                 |           |
|    fps                | 691       |
|    iterations         | 200       |
|    time_elapsed       | 5         |
| 

Proximal Policy Optimization

In [None]:
# Cell 8
# # Proximal Policy Optimization
# from stable_baselines3 import PPO
# from stable_baselines3.common.env_util import make_vec_env

# # Training cell
# vec_env = make_vec_env('ThreeAgentGridworld-v0', n_envs=4)

# model = PPO("MlpPolicy", vec_env, verbose=1, tensorboard_log="./ppo_log1", gamma=0.99)
# model.learn(total_timesteps=300000)
# model.save("multi_agent_ppo_w1")
# del model, PPO, make_vec_env

# Inference

Load trained network:

In [6]:
# Cell 9 
import pygame
import gymnasium as gym
from stable_baselines3 import A2C # For PPO, need to change this

# Load trained network
model = A2C.load("trained_models/multi_agent_a2c_w2.zip")

Play using trained network and default env (we can also use vector env):

In [9]:
# Cell 10
# Make the environment
env = gym.make('ThreeAgentGridworld-v0', render_mode='human')
env.metadata['render_fps'] = 30
obs, info = env.reset()
env.render()

# Start playing
terminated, truncated = False, False
total_rewards = 0
while not (terminated or truncated):
    action, _ = model.predict(obs)
    obs, reward, terminated, truncated,  info = env.step(list(action))
    env.render()
    total_rewards += reward
    print(f"Obs: {obs}, Reward: {reward}, terminated: {terminated}, total_rewards: {total_rewards}")
    pygame.event.get()
print('terminated:', terminated, 'truncated:', truncated)

Obs: [ 29 694 134   0], Reward: -3, terminated: False, total_rewards: -3
Obs: [ 90 680 198   0], Reward: -3, terminated: False, total_rewards: -6
Obs: [419 344 270   0], Reward: -3, terminated: False, total_rewards: -9
Obs: [483 613 592   0], Reward: -3, terminated: False, total_rewards: -12
Obs: [812 280 664   2], Reward: 97, terminated: False, total_rewards: 85
Obs: [ 62  58 162   2], Reward: -3, terminated: False, total_rewards: 82
Obs: [380 800 231   2], Reward: -3, terminated: False, total_rewards: 79
Obs: [452 240 305   2], Reward: -3, terminated: False, total_rewards: 76
Obs: [513 737 624   2], Reward: -3, terminated: False, total_rewards: 73
Obs: [ 26 404 699   2], Reward: -3, terminated: False, total_rewards: 70
Obs: [ 89 158 195   2], Reward: -3, terminated: False, total_rewards: 67
Obs: [416  94 267   2], Reward: -3, terminated: False, total_rewards: 64
Obs: [481 361 590   2], Reward: -3, terminated: False, total_rewards: 61
Obs: [809  36 661   2], Reward: -3, terminated: Fa

In [30]:
# Cell 11
env.close()

## Simulation

**IMPORTANT**: open the scene file using CoppeliaSim robot simulator before running the below cells

In [7]:
# Cell 12
# Simulation class
import time
from coppeliasim_zmqremoteapi_client import RemoteAPIClient
import numpy as np

# Start the remote API client
client = RemoteAPIClient()
sim = client.getObject('sim')
defaultIdleFps = sim.getInt32Param(sim.intparam_idle_fps)
sim.setInt32Param(sim.intparam_idle_fps, 0)

# Height of movement
height = 0.35

class Drone_simulator:    
    def __init__(self, polygon, scaling_factor, height):
        self.scaling_factor = scaling_factor
        self.scaled_polygon = [(x/scaling_factor,y/scaling_factor) for (x,y) in polygon]
        self.rounded_polygon = self.scaled_polygon + [self.scaled_polygon[0]]
        self.color = [[255,0,0],[255,0,255],[0,0,255]]
        self.edges_3d = self.calc_edges_3d()
        self.height = height

    def start_simulation(self):
        self.trace_line = sim.addDrawingObject(sim.drawing_lines, 2, 0, -1, 9999, [255,0,0]) # red line
        sim.startSimulation()
        print('Program started')

    def stop_simulation(self):
        sim.removeDrawingObject(self.trace_line)
        sim.stopSimulation()

    def calc_edges_3d(self):  # To calculate the edges in the polygon
        edges = []
        for i in range(len(self.rounded_polygon) - 1):
            edges.append([list(self.rounded_polygon[i]), list(self.rounded_polygon[i+1])])
        return edges

    def draw_field(self):
        white = [255, 255, 255]
        lineContainer = sim.addDrawingObject(sim.drawing_lines, 2, 0, -1, 9999, white)
        for l in self.edges_3d: # Drawing the field with white lines
            line = l[0] + [self.height] + l[1] + [self.height]
            for j in range(len(line)):
                if line[j] != self.height:
                    line[j] = int(line[j])
            # print(line)
            sim.addDrawingObjectItem(lineContainer, line)

    def set_agent_positions(self, k, info):
        for i in range(k):
            drone = '/Quadcopter['
            obj_path = drone+str(i)+']'
            objHandle = sim.getObject(obj_path)
            print(np.append(info['agent'+str(i+1)],[self.height]))
            x = info['agent'+str(i+1)]
            x = [xi/self.scaling_factor for xi in x]
            x = x + [self.height]
            print(x)
            sim.setObjectPosition(objHandle, -1, x) # Initiate the position of the robots
    
    def set_weed_locations(self, weed_locations):
        weed_obj = sim.getObject('/weed')
        for i, loc in enumerate(weed_locations):
            new_weed_obj = sim.copyPasteObjects([weed_obj])[0]
            x = [xi/self.scaling_factor for xi in loc]
            new_pos = x + [0]
            sim.setObjectPosition(new_weed_obj, -1, new_pos)

    def move_agents(self, k, info):
        for i in range(k):
            obj_path = '/target[' + str(i) + ']'
            objHandle = sim.getObject(obj_path)
            prev_pos = sim.getObjectPosition(objHandle, -1) # current object position
            print(np.append(info['agent'+str(i+1)],[self.height]))
            x = info['agent'+str(i+1)] # Get the x,y from info of gym env
            x = [xi/self.scaling_factor for xi in x] # scale the x,y
            x = x + [self.height] # add the z (height)
            # print(x)
            sim.setObjectPosition(objHandle, -1, x) # Initiate the position of the robots
            # draw the line
            line_data = prev_pos + x
            sim.addDrawingObjectItem(self.trace_line, line_data)

Simulation using random movement

In [7]:
# Cell 13
# Make the environment
env = gym.make('ThreeAgentGridworld-v0', render_mode='human')
env.metadata['render_fps'] = 5
obs, info = env.reset()
env.render()

drone_simulator = Drone_simulator(polygon=env.poly_vertices, scaling_factor=5, height=height)
drone_simulator.draw_field()
drone_simulator.set_agent_positions(k=3, info=info)
drone_simulator.set_weed_locations(weed_locations=env.infected_locations)

# Start the simulator
drone_simulator.start_simulation()
terminated, truncated = False, False
total_rewards = 0
while not (terminated or truncated):
    action = env.action_space.sample()
    obs, reward, terminated, truncated,  info = env.step(action)
    env.render()
    total_rewards += reward
    print(f"Obs: {obs}, Reward: {reward}, terminated: {terminated}, total_rewards: {total_rewards}, action: {action}")
    pygame.event.get()
    drone_simulator.move_agents(k=3, info=info) # Simulate
print('terminated:', terminated, 'truncated:', truncated)

  logger.warn(
  logger.warn(


[14.   34.    0.35]
[np.float64(2.8), np.float64(6.8), 0.35]
[40.   25.    0.35]
[np.float64(8.0), np.float64(5.0), 0.35]
[20.   40.    0.35]
[np.float64(4.0), np.float64(8.0), 0.35]
Program started
Obs: [775 495 134   0], Reward: -3, terminated: False, total_rewards: -3, action: [4 2 2]
[14.   34.    0.35]
[40.   24.    0.35]
[20.   39.    0.35]
Obs: [775 495 134   0], Reward: -30, terminated: False, total_rewards: -33, action: [4 4 4]
[14.   34.    0.35]
[40.   24.    0.35]
[20.   39.    0.35]
Obs: [ 29 193 627   0], Reward: -3, terminated: False, total_rewards: -36, action: [2 0 3]
[14.   33.    0.35]
[39.   24.    0.35]
[20.   40.    0.35]
Obs: [606 262 134   0], Reward: -12, terminated: False, total_rewards: -48, action: [0 2 2]
[13.   33.    0.35]
[39.   23.    0.35]
[20.   39.    0.35]
Obs: [113 262 198   0], Reward: -12, terminated: False, total_rewards: -60, action: [2 4 2]
[13.   32.    0.35]
[39.   23.    0.35]
[20.   38.    0.35]
Obs: [175 555 134   0], Reward: -12, termina

Simulation using trained network

In [8]:
# Cell 14
# Make the environment
env = gym.make('ThreeAgentGridworld-v0', render_mode='human')
env.metadata['render_fps'] = 5
obs, info = env.reset()
env.render()

# Make the simulator object, draw the field, and set agent positions
drone_simulator = Drone_simulator(polygon=env.poly_vertices, scaling_factor=5, height=height)
drone_simulator.draw_field()
drone_simulator.set_agent_positions(k=3, info=info)
drone_simulator.set_weed_locations(weed_locations=env.infected_locations)

# Start simulation
drone_simulator.start_simulation()
terminated, truncated = False, False
total_rewards = 0
while not (terminated or truncated):
    action, _ = model.predict(obs)
    obs, reward, terminated, truncated,  info = env.step(list(action))
    env.render()
    total_rewards += reward
    print(f"Obs: {obs}, Reward: {reward}, terminated: {terminated}, total_rewards: {total_rewards}, action: {action}")
    pygame.event.get()
    drone_simulator.move_agents(k=3, info=info) # Simulate
print('terminated:', terminated, 'truncated:', truncated)
# drone_simulator.stop_simulation()
# env.close()

  logger.warn(
  logger.warn(


[14.   34.    0.35]
[np.float64(2.8), np.float64(6.8), 0.35]
[40.   25.    0.35]
[np.float64(8.0), np.float64(5.0), 0.35]
[20.   40.    0.35]
[np.float64(4.0), np.float64(8.0), 0.35]
Program started
Obs: [ 29 694 134   0], Reward: -3, terminated: False, total_rewards: -3, action: [2 0 2]
[14.   33.    0.35]
[39.   25.    0.35]
[20.   39.    0.35]
Obs: [ 90 680 198   0], Reward: -3, terminated: False, total_rewards: -6, action: [2 0 2]
[14.   32.    0.35]
[38.   25.    0.35]
[20.   38.    0.35]
Obs: [419 344 270   0], Reward: -3, terminated: False, total_rewards: -9, action: [2 0 2]
[14.   31.    0.35]
[37.   25.    0.35]
[20.   37.    0.35]
Obs: [483 613 592   0], Reward: -3, terminated: False, total_rewards: -12, action: [2 0 2]
[14.   30.    0.35]
[36.   25.    0.35]
[20.   36.    0.35]
Obs: [812 280 664   2], Reward: 97, terminated: False, total_rewards: 85, action: [2 0 2]
[14.   29.    0.35]
[35.   25.    0.35]
[20.   35.    0.35]
Obs: [ 62  58 162   2], Reward: -3, terminated: Fa

In [None]:
# Cell 15
drone_simulator.stop_simulation()
env.close()