## 1. Getting started <a class="anchor" id="first-section"></a>

### 1.1 Launch simulator (`BlueSky`) and the `BlueBird` interface layer

If you have not already started the BlueSky simulator and BlueBird, you can launch them using the command below:

In [8]:
!docker-compose --file ../docker-compose.yml up --detach
import pydodo
import gym
import time
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from maddpg_agent import Agent
%matplotlib inline
from gym import spaces
from gym.utils import seeding
import itertools
from pydodo.episode_log import episode_log
from pydodo import change_altitude
from pydodo.metrics import loss_of_separation
from pydodo.request_position import all_positions
from pydodo.simulation_control import simulation_step, reset_simulation, pause_simulation


# Debugging helpers, left disabled: print the BlueBird URL / show the config help.
#print(pydodo.bluebird_connect.get_bluebird_url())
#help(pydodo.bluebird_config)
# Query the simulator twice; the return values are discarded (not assigned and
# not the last expression of the cell). Presumably a connectivity check that
# fails loudly if BlueBird is unreachable -- TODO confirm.
pydodo.all_positions()
pydodo.simulation_info()
# Upload the sector definition and the two-aircraft scenario under the names
# 'test_sector' / 'test_scenario'. The file paths are relative to the
# notebook's working directory. The value of the last call is what the cell
# displays as its output.
pydodo.upload_sector('sector-X-sector-X-140-400.geojson', 'test_sector')
pydodo.upload_scenario('cartesian_2agent.json', 'test_scenario')

bluesky is up-to-date
bluebird is up-to-date
twitcher is up-to-date


True

In [2]:
def obsr(obs):
    """Turn a table of observations into a float32 feature matrix.

    The input is coerced to a 2-D array, its first column is discarded, and
    the remaining columns are rebuilt as a fresh ``np.float32`` array.

    Args:
        obs: any 2-D array-like (one row per entity).

    Returns:
        np.ndarray of dtype float32 with one column fewer than ``obs``.
    """
    table = np.array(obs)
    # Go through plain Python lists so the result is a new, homogeneous
    # float32 array regardless of the dtype the input was coerced to.
    feature_rows = table[:, 1:].tolist()
    return np.array(feature_rows, dtype=np.float32)


class SimurghEnv(gym.Env):
    """Two-aircraft altitude-control environment driven through pydodo.

    Each action is a pair of flight levels, one per aircraft listed in
    ``AIRCRAFT_IDS``. All simulator access (commands, stepping, positions,
    separation metric) goes through ``pydodo``; the simulator services must
    already be running before the environment is used.

    NOTE: ``step`` returns the 3-tuple ``(observation, reward, done)`` rather
    than gym's conventional 4-tuple, because existing callers unpack exactly
    three values.

    Open items carried over from the original implementation:
      * TODO: verify BlueBird/BlueSky are running and the simulator mode is
        "agent" before starting a scenario.
      * TODO: define a proper ACTION_MAPPING instead of raw flight levels.
      * TODO: include sector-exit scores in the reward (not working in the
        simulator at the time of writing).
      * NOTE: the end of an episode could alternatively be defined as
        "no aircraft left in the simulation".
    """

    # Callsigns controlled by action[0] and action[1] respectively.
    AIRCRAFT_IDS = ('DELTA210', 'DELTA426')

    # Sector / scenario definitions (re-)uploaded on every reset.
    SECTOR_FILE = 'sector-X-sector-X-140-400.geojson'
    SECTOR_NAME = 'test_sector'
    SCENARIO_FILE = 'cartesian_2agent.json'
    SCENARIO_NAME = 'test_scenario'

    # Episode-termination thresholds: the episode ends once the first aircraft
    # is below MIN_LATITUDE or the second is beyond MAX_LONGITUDE.
    # NOTE(review): presumably these mark the sector boundary -- confirm.
    MIN_LATITUDE = 50.9
    MAX_LONGITUDE = 0.5

    # Reward shaping: flat bonus when every pair is separated, otherwise a
    # penalty proportional to the total magnitude of the separation scores.
    SEPARATION_BONUS = 10.0
    SEPARATION_PENALTY = 10.0

    def __init__(self):
        # The spaces are left unset here; the notebook assigns them after
        # construction.
        self.action_space = None
        self.observation_space = None

        self.seed()

        # The first episode is NOT started here; call reset() explicitly.

    def step(self, action):
        """Apply an action, advance the simulation one step, and score it.

        Args:
            action: indexable with two flight levels (``action[0]`` for the
                first aircraft in ``AIRCRAFT_IDS``, ``action[1]`` for the
                second), or ``None`` to step without issuing commands.

        Returns:
            tuple ``(observation, reward, done)``:
                observation: result of ``all_positions()`` after the step.
                reward (float): ``SEPARATION_BONUS`` when every aircraft pair
                    has a separation score of zero, otherwise
                    ``-SEPARATION_PENALTY`` times the summed absolute
                    separation scores over all pairs.
                done (bool): whether either termination threshold was crossed.
        """
        if action is not None:
            change_altitude(self.AIRCRAFT_IDS[0], flight_level=action[0])
            change_altitude(self.AIRCRAFT_IDS[1], flight_level=action[1])

        simulation_step()

        # Reward: score every unordered pair of aircraft currently present.
        aircraft_pos = all_positions()
        aircraft_pairs = itertools.combinations(aircraft_pos.index, r=2)
        separations = np.array(
            [loss_of_separation(acid1, acid2) for acid1, acid2 in aircraft_pairs],
            dtype=float,
        )
        # Reduce to a scalar so this works for any number of pairs (including
        # none); taking the truth value of the array itself only works for
        # exactly one pair. NaN scores are ignored, which matches the previous
        # behaviour where a NaN compared as "no violation".
        violation = float(np.nansum(np.abs(separations)))
        if violation > 0:
            reward = -self.SEPARATION_PENALTY * violation
        else:
            reward = self.SEPARATION_BONUS

        # Series.item() replaces the deprecated Series.bool(); both require a
        # single-element Series. `or` keeps the original short-circuit, so the
        # second position is only requested when the first check is False.
        first_id, second_id = self.AIRCRAFT_IDS
        done = bool(
            (pydodo.aircraft_position(first_id)['latitude'] < self.MIN_LATITUDE).item()
            or (pydodo.aircraft_position(second_id)['longitude'] > self.MAX_LONGITUDE).item()
        )

        return all_positions(), reward, done

    def reset(self):
        """Reset the simulation, re-upload sector and scenario, and observe.

        Returns:
            The result of ``all_positions()`` immediately after the scenario
            has been re-uploaded (the initial observation).
        """
        pydodo.reset_simulation()
        pydodo.upload_sector(self.SECTOR_FILE, self.SECTOR_NAME)
        pydodo.upload_scenario(self.SCENARIO_FILE, self.SCENARIO_NAME)
        return all_positions()

    def close(self):
        """Pause the simulation and request the episode log."""
        pause_simulation()
        episode_log()



In [3]:
# One learning agent per controlled aircraft.
num_agents = 2

# Create the environment and record the problem dimensions on it.
# NOTE(review): these are plain integers, not gym.spaces objects, so
# env.action_space.sample() is not available -- confirm this is intended.
env = SimurghEnv()
env.action_space = 1
env.observation_space = 8



In [4]:
# Build the learner from the project-local maddpg_agent module with a fixed
# random seed. The sizes repeat the literals assigned to
# env.observation_space (8) and env.action_space (1).
# NOTE(review): 8 is presumably the number of columns kept by obsr() -- confirm.
agent = Agent(state_size=8, action_size=1, random_seed=10)

In [None]:
# Smoke test: send the same pair of flight levels for 20 simulator steps and
# accumulate the reward. The environment is NOT reset first and the `done`
# flag is ignored, so all 20 steps always run.
action = (100, 100)                      # fixed flight level for both aircraft
state = pydodo.all_positions()           # current positions as reported by the simulator
scores = np.zeros(num_agents)            # running score, one slot per agent
for i in range(20):
    nxt_state, reward, done = env.step(action)
    next_state = obsr(nxt_state)         # numeric features of the new observation
    state = next_state
    scores += reward                     # the same reward is added to every agent's slot
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

In [None]:
def ddpg(n_episodes=1500, max_t=1000, print_every=10, target_score=300, verbose=False):
    """Train the module-level ``agent`` on the module-level ``env``.

    Args:
        n_episodes (int): maximum number of training episodes.
        max_t (int): maximum number of environment steps per episode. An
            episode ends earlier as soon as the environment reports ``done``.
        print_every (int): used for three things, as in the original code:
            the size of the rolling score window, the interval (in episodes)
            between progress lines/checkpoints, and the interval (in global
            steps) at which a new action is requested from the agent; the
            previous action is repeated in between.
        target_score (float): training stops once the rolling mean score
            reaches this value.
        verbose (bool): when True, print the action sent at every step.

    Returns:
        list: one per-agent score array per completed episode.

    Side effects:
        Writes ``checkpoint_actor.pth`` and ``checkpoint_critic.pth`` every
        ``print_every`` episodes and when the target score is reached.
    """
    scores_deque = deque(maxlen=print_every)
    scores = []
    total_steps = 0  # global step counter, deliberately not reset per episode
    for i_episode in range(1, n_episodes + 1):
        env.reset()
        obs = pydodo.all_positions()
        state = np.array(obsr(obs))
        # Action used until the agent is first queried in this episode.
        action = np.array([60, 60])
        agent.reset()
        score = np.zeros(num_agents)
        # Bounded loop: max_t was previously accepted but ignored, so an
        # episode whose `done` condition never fired would never terminate.
        for _ in range(max_t):
            if total_steps % print_every == 0:
                # NOTE(review): the actor output is divided by 10, truncated
                # to int and clipped to [60, 75]. If the actor output is
                # bounded to a small range this always yields 60 -- confirm
                # the intended scaling against maddpg_agent.
                actions = ((agent.act(state).reshape(2)) / 10).astype(int)
                action = np.clip(actions, 60, 75)
            if verbose:
                print(action)
            nxt_state, reward, done = env.step(action)
            next_state = obsr(nxt_state)
            agent.step(state, action, reward, next_state, done)
            state = np.round(next_state, 2)
            score += reward
            total_steps += 1
            if np.any(done):
                break

        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)), end="")

        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

        if np.mean(scores_deque) >= target_score:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            break

    return scores

# Run training with the default settings; `scores` holds one entry per episode.
scores = ddpg()

# Learning curve: score against 1-based episode number.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

#  Close environment

In [17]:
!docker-compose down

Stopping twitcher ... 
Stopping bluebird ... 
Stopping bluesky  ... done
Removing twitcher ... 
Removing bluebird ... 
Removing bluesky  ... 
Removing network simurgh_default
