In [None]:
from typing import Dict, Tuple
import os

import torch
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.env import BaseEnv
from ray.rllib.evaluation import Episode, RolloutWorker
from ray.rllib.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch

import gym
from gym import wrappers
import ray
import numpy as np
from scipy.interpolate import interp1d
import pickle

import matplotlib.pyplot as plt
from torchsummary import summary

import io
import base64
from IPython.display import HTML

from src.envs.multiple_particles_in_flow_continuous import MultipleParticlesInFlowContinuous
from src.envs.multiple_particles_in_flow_discrete import MultipleParticlesInFlowDiscrete
from src.envs.multiple_particles_in_flow_delayed_obs_continuous import MultipleParticlesInFlowDelayedObsContinuous
from src.envs.multiple_particles_in_flow_delayed_obs_discrete import MultipleParticlesInFlowDelayedObsDiscrete

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Useful Functions

In [None]:
def test_and_plot(env, policy, lstm=False, cell_size=None):
    """Roll out one episode of ``policy`` in ``env`` and plot what happened.

    Parameters
    ----------
    env
        Environment exposing ``reset()``, ``step(action)`` (returning
        ``obs, reward, done, info``), ``jet_dist(x, y)`` and the attributes
        ``cur_particles`` and ``cur_state``.
    policy
        Callable producing an action. It is called as
        ``policy(obs, explore=False)``; when ``lstm`` is true it is called as
        ``policy(obs, lstm_state, explore=False)`` and must return a 3-tuple
        whose first two items are the action and the next recurrent state.
    lstm
        Set to True for a recurrent policy that needs a state passed in.
    cell_size
        Length of each of the two zero vectors used as the initial recurrent
        state. When omitted, the notebook-level ``lstm_cell_size`` is used,
        which is what this function always did before the parameter existed.

    Returns
    -------
    The axes of the lower panel. No legend is drawn; call ``ax.legend()`` on
    the returned axes to show the trace labels.
    """
    obs = env.reset()
    done = False
    total_reward = 0
    step = 0

    obs_hist = [obs]
    particle_hist = [env.cur_particles]
    state_hist = [env.cur_state]
    action_hist = []
    if lstm:
        if cell_size is None:
            # Backward-compatible fallback to the global set by the training cells.
            cell_size = lstm_cell_size
        # Two zero vectors as initial recurrent state.
        lstm_state = [np.zeros([cell_size], np.float32) for _ in range(2)]

    while not done:
        if lstm:
            action, lstm_state, _ = policy(obs, lstm_state, explore=False)
        else:
            action = policy(obs, explore=False)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        step += 1
        obs_hist.append(obs)
        action_hist.append(action)
        particle_hist.append(env.cur_particles)
        state_hist.append(env.cur_state)

    print(f"Played 1 episode; total-reward={total_reward}; Number of steps: {step}")

    # The histories hold one more entry than action_hist (the initial values
    # recorded before the first step), hence the [:-1] slices below.
    particle_hist = np.array(particle_hist)
    state_hist = np.array(state_hist)
    action_hist = np.array(action_hist)
    obs_hist = np.array(obs_hist)

    # Evaluate env.jet_dist on a regular grid to draw it as a background field.
    act_x, act_y = np.meshgrid(np.linspace(0,15,151),np.linspace(0,1.2,15))
    act_mag = env.jet_dist(act_x, act_y)

    fig = plt.figure(figsize=(16,8))

    # Upper panel: particle paths (indexed as [step, particle, coordinate]),
    # the env.cur_state path, and the action plotted against state x.
    ax = fig.add_subplot(211)
    ax.plot(particle_hist[:,:,0], particle_hist[:,:,1], c='r', lw=0.1)
    ax.scatter(particle_hist[0,:,0], particle_hist[0,:,1], s=5, c='r')
    ax.scatter(particle_hist[-1,:,0], particle_hist[-1,:,1], s=5, c='r')
    ax.plot(state_hist[:,0], state_hist[:,1], c='darkred', lw=2)
    # ax.plot(obs_hist[:,0], obs_hist[:,1], c='r', label='Observation')
    # ax.plot(action_region_x, action_region_y, c='orange', label='Force Field')
    ax.contourf(act_x, act_y, act_mag, vmin=-1.6, vmax=1.5, cmap='RdBu', zorder=-10)
    ax.plot(state_hist[:-1,0], action_hist, c='k', label='Action')
    ax.grid(True)
    ax.set_aspect('equal', 'box')
    ax.set_xlim([0,15])
    ax.set_ylim([0,1.2])

    # Lower panel: the same quantities against the step index.
    ax = fig.add_subplot(212)
    ax.plot(action_hist, c='k', label='Action')
    ax.plot(state_hist[:-1,1], c='darkred', label='Mean y')
    # Fixed: this trace was labelled 'Action', clashing with the action trace.
    ax.plot(obs_hist, c='b', label='Observation')
    ax.set_xlim([0,200])
    ax.set_ylim([0,1.2])
    ax.grid()
    
    return ax

In [None]:
def plot_results(train_episodes, train_reward, train_downwash, train_actuation):
    """Draw three side-by-side training curves against the episode count.

    Each of ``train_reward``, ``train_downwash`` and ``train_actuation`` is
    plotted over ``train_episodes`` in its own panel, with a legend and a
    grid. Returns the axes of the last (right-most) panel.
    """
    # (subplot position, y-values, line colour, legend label) for each panel.
    panels = (
        (131, train_reward, 'r', 'Average Reward'),
        (132, train_downwash, 'b', 'Average Downwash'),
        (133, train_actuation, 'k', 'Average Actuation'),
    )

    fig = plt.figure(figsize=[16,4])
    for position, series, colour, caption in panels:
        ax = fig.add_subplot(position)
        ax.plot(train_episodes, series, c=colour, label=caption)
        ax.legend()
        ax.grid()

    return ax

In [None]:
def const_policy_cont(a, explore=False):
    """Baseline policy that ignores its inputs and always returns ``[1]``.

    ``a`` and ``explore`` are accepted only so the function can be called
    like the other policies passed to ``test_and_plot``.
    """
    constant_action = [1]
    return constant_action
def const_policy_disc(a, explore=False):
    """Baseline policy that ignores its inputs and always returns ``1``.

    ``a`` and ``explore`` are accepted only so the function can be called
    like the other policies passed to ``test_and_plot``.
    """
    constant_action = 1
    return constant_action

## Callbacks

In [None]:
class MyCallbacks(DefaultCallbacks):
    """RLlib callbacks that track two reward terms of the environment.

    At every step the attributes ``cur_reward_downwash`` and
    ``cur_reward_actuation`` are read from the sub-environment that produced
    the step. At the end of an episode their per-episode means are stored in
    ``episode.custom_metrics`` and the full per-step series in
    ``episode.hist_data``, both under the keys ``"reward_downwash"`` and
    ``"reward_actuation"``.
    """

    def on_episode_start(
        self,
        *,
        worker: RolloutWorker,
        base_env: BaseEnv,
        policies: Dict[str, Policy],
        episode: Episode,
        env_index: int,
        **kwargs
    ):
        """Create empty per-episode buffers for both reward terms."""
        # Make sure this episode has just been started (only initial obs
        # logged so far).
        assert episode.length == 0, (
            "ERROR: `on_episode_start()` callback should be called right "
            "after env reset!"
        )
        episode.user_data["reward_downwash"] = []
        episode.hist_data["reward_downwash"] = []
        episode.user_data["reward_actuation"] = []
        episode.hist_data["reward_actuation"] = []
        
    def on_episode_step(
        self,
        *,
        worker: RolloutWorker,
        base_env: BaseEnv,
        policies: Dict[str, Policy],
        episode: Episode,
        env_index: int,
        **kwargs
    ):
        """Append the current reward terms of the stepping sub-environment."""
        # Make sure this episode is ongoing.
        assert episode.length > 0, (
            "ERROR: `on_episode_step()` callback should not be called right "
            "after env reset!"
        )
        # Read from the sub-environment this episode runs in. Index 0 was
        # hard-coded before, which is only right with one sub-environment.
        sub_env = base_env.get_sub_environments()[env_index]
        reward_downwash = sub_env.cur_reward_downwash
        # The actuation term is logged with its sign flipped.
        reward_actuation = -sub_env.cur_reward_actuation

        episode.user_data["reward_downwash"].append(reward_downwash)
        episode.user_data["reward_actuation"].append(reward_actuation)

    def on_episode_end(
        self,
        *,
        worker: RolloutWorker,
        base_env: BaseEnv,
        policies: Dict[str, Policy],
        episode: Episode,
        env_index: int,
        **kwargs
    ):
        """Publish the episode means and per-step series of both terms."""
        # Check if there are multiple episodes in a batch, i.e.
        # "batch_mode": "truncate_episodes".
        if worker.policy_config["batch_mode"] == "truncate_episodes":
            # Make sure this episode is really done.
            assert episode.batch_builder.policy_collectors["default_policy"].batches[
                -1
            ]["dones"][-1], (
                "ERROR: `on_episode_end()` should only be called "
                "after episode is done!"
            )
        reward_downwash = np.mean(episode.user_data["reward_downwash"])
        reward_actuation = np.mean(episode.user_data["reward_actuation"])

        # Scalars go to custom_metrics; the raw per-step lists go to hist_data.
        episode.custom_metrics["reward_downwash"] = reward_downwash
        episode.custom_metrics["reward_actuation"] = reward_actuation
        episode.hist_data["reward_downwash"] = episode.user_data["reward_downwash"]
        episode.hist_data["reward_actuation"] = episode.user_data["reward_actuation"]

---------------------------------------------------------------------
# Shoot down MULTIPLE particles - Partially Observed - Maximize Downwash
---------------------------------------------------------------------

## Global Configurations

## PPO + LSTM - Discrete - actuation cost weight W = 5

In [None]:
test_name = "act_cost_5_n_cells_1_1"

# Environment settings for this experiment (actuation cost weight W = 5).
global_env_config = {
    "dt": 0.1,
    "xc": 9.0,
    "yc": 0.0,
    "sigma_x": 0.6,
    "sigma_y": 0.5,
    "gain": 0.25,
    "act_cost_weight": 5.0,
    "obs_min": [7.0, 0.1],
    "obs_max": [8.0, 0.5],
    "n_particles": 400,
    "lamb": 0.1,
    "n_cells": [1,1]
}

env_config = global_env_config.copy()

# Kept as a notebook-level name: test_and_plot() reads `lstm_cell_size` to
# build the initial recurrent state.
lstm_cell_size = 16

# Per-seed result files and the final checkpoints are written here.
results_dir = f"ray_results/ppo_lstm_discrete_{test_name}"

# Train one independent PPO+LSTM agent per random seed.
for random_seed in range(100,105):
    # Seed the driver process.
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    trainer_ppo_lstm_disc = PPOTrainer(
        config={
            "env": MultipleParticlesInFlowDiscrete,
            "env_config": env_config,
            "framework": "torch",
            # The calls above only seed this process; pass the seed to RLlib
            # as well so that its rollout workers are seeded too.
            "seed": random_seed,
            "train_batch_size": 2*1024,
            "callbacks": MyCallbacks,
            # Parallel rollouts
            # "num_workers": 8,
            "num_gpus": 1,
            "lr": 0.00005,
            "clip_param": 0.99,
            "log_level": "INFO",
            "model": {
                "use_lstm": True,
                "lstm_cell_size": lstm_cell_size,
                "lstm_use_prev_action": False,
                "lstm_use_prev_reward": False,
                "max_seq_len": 40,
                "fcnet_hiddens": [256, 256],
            },
        }
    )

    train_reward_hist_lstm_disc = []
    train_downwash_hist_lstm_disc = []
    train_actuation_hist_lstm_disc = []
    train_episodes_total_lstm_disc = []

    for i in range(400):
        results = trainer_ppo_lstm_disc.train()
        reward_mean = results['episode_reward_mean']
        # These two keys are produced from the metrics MyCallbacks records.
        downwash_mean = results['custom_metrics']['reward_downwash_mean']
        actuation_mean = results['custom_metrics']['reward_actuation_mean']

        train_reward_hist_lstm_disc.append(reward_mean)
        train_downwash_hist_lstm_disc.append(downwash_mean)
        train_actuation_hist_lstm_disc.append(actuation_mean)
        train_episodes_total_lstm_disc.append(results['episodes_total'])
        print(f"i={i}; reward={reward_mean}; downwash={downwash_mean}; actuation={actuation_mean}; episodes={results['episodes_this_iter']}; total episodes={results['episodes_total']}")
        # Intermediate checkpoint every 10th iteration, into the trainer's log dir.
        if i % 10 == 9:
            trainer_ppo_lstm_disc.save_checkpoint(trainer_ppo_lstm_disc.logdir)

    # Save results to file
    train_lstm_disc_results = {
        "train_reward": train_reward_hist_lstm_disc,
        "train_downwash": train_downwash_hist_lstm_disc,
        "train_actuation": train_actuation_hist_lstm_disc,
        "train_episodes": train_episodes_total_lstm_disc
    }

    filename = f"{results_dir}/train_results_{random_seed}.pkl"
    os.makedirs(results_dir, exist_ok=True)
    with open(filename, 'wb') as f:
        pickle.dump(train_lstm_disc_results, f)
    # NOTE(review): every seed saves its final checkpoint into this same
    # directory; if the checkpoint file name depends only on the training
    # iteration, later seeds overwrite earlier ones. Confirm, and consider a
    # per-seed sub-directory.
    trainer_ppo_lstm_disc.save_checkpoint(results_dir)
    
    plot_results(train_episodes_total_lstm_disc, train_reward_hist_lstm_disc, train_downwash_hist_lstm_disc, train_actuation_hist_lstm_disc)
    

## PPO + LSTM - Discrete - actuation cost weight W = 10

In [None]:
test_name = "act_cost_10_n_cells_1_1"

# Environment settings for this experiment (actuation cost weight W = 10).
global_env_config = {
    "dt": 0.1,
    "xc": 9.0,
    "yc": 0.0,
    "sigma_x": 0.6,
    "sigma_y": 0.5,
    "gain": 0.25,
    "act_cost_weight": 10.0,
    "obs_min": [7.0, 0.1],
    "obs_max": [8.0, 0.5],
    "n_particles": 400,
    "lamb": 0.1,
    "n_cells": [1,1]
}

env_config = global_env_config.copy()

# Kept as a notebook-level name: test_and_plot() reads `lstm_cell_size` to
# build the initial recurrent state.
lstm_cell_size = 16

# Per-seed result files and the final checkpoints are written here.
results_dir = f"ray_results/ppo_lstm_discrete_{test_name}"

# Train one independent PPO+LSTM agent per random seed.
for random_seed in range(100,105):
    # Seed the driver process.
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    trainer_ppo_lstm_disc = PPOTrainer(
        config={
            "env": MultipleParticlesInFlowDiscrete,
            "env_config": env_config,
            "framework": "torch",
            # The calls above only seed this process; pass the seed to RLlib
            # as well so that its rollout workers are seeded too.
            "seed": random_seed,
            "train_batch_size": 2*1024,
            "callbacks": MyCallbacks,
            # Parallel rollouts
            # "num_workers": 8,
            "num_gpus": 1,
            "lr": 0.00005,
            "clip_param": 0.99,
            "log_level": "INFO",
            "model": {
                "use_lstm": True,
                "lstm_cell_size": lstm_cell_size,
                "lstm_use_prev_action": False,
                "lstm_use_prev_reward": False,
                "max_seq_len": 40,
                "fcnet_hiddens": [256, 256],
            },
        }
    )

    train_reward_hist_lstm_disc = []
    train_downwash_hist_lstm_disc = []
    train_actuation_hist_lstm_disc = []
    train_episodes_total_lstm_disc = []

    for i in range(400):
        results = trainer_ppo_lstm_disc.train()
        reward_mean = results['episode_reward_mean']
        # These two keys are produced from the metrics MyCallbacks records.
        downwash_mean = results['custom_metrics']['reward_downwash_mean']
        actuation_mean = results['custom_metrics']['reward_actuation_mean']

        train_reward_hist_lstm_disc.append(reward_mean)
        train_downwash_hist_lstm_disc.append(downwash_mean)
        train_actuation_hist_lstm_disc.append(actuation_mean)
        train_episodes_total_lstm_disc.append(results['episodes_total'])
        print(f"i={i}; reward={reward_mean}; downwash={downwash_mean}; actuation={actuation_mean}; episodes={results['episodes_this_iter']}; total episodes={results['episodes_total']}")
        # Intermediate checkpoint every 10th iteration, into the trainer's log dir.
        if i % 10 == 9:
            trainer_ppo_lstm_disc.save_checkpoint(trainer_ppo_lstm_disc.logdir)

    # Save results to file
    train_lstm_disc_results = {
        "train_reward": train_reward_hist_lstm_disc,
        "train_downwash": train_downwash_hist_lstm_disc,
        "train_actuation": train_actuation_hist_lstm_disc,
        "train_episodes": train_episodes_total_lstm_disc
    }

    filename = f"{results_dir}/train_results_{random_seed}.pkl"
    os.makedirs(results_dir, exist_ok=True)
    with open(filename, 'wb') as f:
        pickle.dump(train_lstm_disc_results, f)
    # NOTE(review): every seed saves its final checkpoint into this same
    # directory; if the checkpoint file name depends only on the training
    # iteration, later seeds overwrite earlier ones. Confirm, and consider a
    # per-seed sub-directory.
    trainer_ppo_lstm_disc.save_checkpoint(results_dir)
    
    plot_results(train_episodes_total_lstm_disc, train_reward_hist_lstm_disc, train_downwash_hist_lstm_disc, train_actuation_hist_lstm_disc)
    

## PPO + LSTM - Discrete - actuation cost weight W = 20

In [None]:
test_name = "act_cost_20_n_cells_1_1"

# Environment settings for this experiment (actuation cost weight W = 20).
global_env_config = {
    "dt": 0.1,
    "xc": 9.0,
    "yc": 0.0,
    "sigma_x": 0.6,
    "sigma_y": 0.5,
    "gain": 0.25,
    "act_cost_weight": 20.0,
    "obs_min": [7.0, 0.1],
    "obs_max": [8.0, 0.5],
    "n_particles": 400,
    "lamb": 0.1,
    "n_cells": [1,1]
}

env_config = global_env_config.copy()

# Kept as a notebook-level name: test_and_plot() reads `lstm_cell_size` to
# build the initial recurrent state.
lstm_cell_size = 16

# Per-seed result files and the final checkpoints are written here.
results_dir = f"ray_results/ppo_lstm_discrete_{test_name}"

# Train one independent PPO+LSTM agent per random seed.
for random_seed in range(100,105):
    # Seed the driver process.
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    trainer_ppo_lstm_disc = PPOTrainer(
        config={
            "env": MultipleParticlesInFlowDiscrete,
            "env_config": env_config,
            "framework": "torch",
            # The calls above only seed this process; pass the seed to RLlib
            # as well so that its rollout workers are seeded too.
            "seed": random_seed,
            "train_batch_size": 2*1024,
            "callbacks": MyCallbacks,
            # Parallel rollouts
            # "num_workers": 8,
            "num_gpus": 1,
            "lr": 0.00005,
            "clip_param": 0.99,
            "log_level": "INFO",
            "model": {
                "use_lstm": True,
                "lstm_cell_size": lstm_cell_size,
                "lstm_use_prev_action": False,
                "lstm_use_prev_reward": False,
                "max_seq_len": 40,
                "fcnet_hiddens": [256, 256],
            },
        }
    )

    train_reward_hist_lstm_disc = []
    train_downwash_hist_lstm_disc = []
    train_actuation_hist_lstm_disc = []
    train_episodes_total_lstm_disc = []

    for i in range(400):
        results = trainer_ppo_lstm_disc.train()
        reward_mean = results['episode_reward_mean']
        # These two keys are produced from the metrics MyCallbacks records.
        downwash_mean = results['custom_metrics']['reward_downwash_mean']
        actuation_mean = results['custom_metrics']['reward_actuation_mean']

        train_reward_hist_lstm_disc.append(reward_mean)
        train_downwash_hist_lstm_disc.append(downwash_mean)
        train_actuation_hist_lstm_disc.append(actuation_mean)
        train_episodes_total_lstm_disc.append(results['episodes_total'])
        print(f"i={i}; reward={reward_mean}; downwash={downwash_mean}; actuation={actuation_mean}; episodes={results['episodes_this_iter']}; total episodes={results['episodes_total']}")
        # Intermediate checkpoint every 10th iteration, into the trainer's log dir.
        if i % 10 == 9:
            trainer_ppo_lstm_disc.save_checkpoint(trainer_ppo_lstm_disc.logdir)

    # Save results to file
    train_lstm_disc_results = {
        "train_reward": train_reward_hist_lstm_disc,
        "train_downwash": train_downwash_hist_lstm_disc,
        "train_actuation": train_actuation_hist_lstm_disc,
        "train_episodes": train_episodes_total_lstm_disc
    }

    filename = f"{results_dir}/train_results_{random_seed}.pkl"
    os.makedirs(results_dir, exist_ok=True)
    with open(filename, 'wb') as f:
        pickle.dump(train_lstm_disc_results, f)
    # NOTE(review): every seed saves its final checkpoint into this same
    # directory; if the checkpoint file name depends only on the training
    # iteration, later seeds overwrite earlier ones. Confirm, and consider a
    # per-seed sub-directory.
    trainer_ppo_lstm_disc.save_checkpoint(results_dir)
    
    plot_results(train_episodes_total_lstm_disc, train_reward_hist_lstm_disc, train_downwash_hist_lstm_disc, train_actuation_hist_lstm_disc)
    

## PPO + LSTM - Discrete - actuation cost weight W = 1

In [None]:
test_name = "act_cost_1_n_cells_1_1"

# Environment settings for this experiment (actuation cost weight W = 1).
global_env_config = {
    "dt": 0.1,
    "xc": 9.0,
    "yc": 0.0,
    "sigma_x": 0.6,
    "sigma_y": 0.5,
    "gain": 0.25,
    "act_cost_weight": 1.0,
    "obs_min": [7.0, 0.1],
    "obs_max": [8.0, 0.5],
    "n_particles": 400,
    "lamb": 0.1,
    "n_cells": [1,1]
}

env_config = global_env_config.copy()

# Kept as a notebook-level name: test_and_plot() reads `lstm_cell_size` to
# build the initial recurrent state.
lstm_cell_size = 16

# Per-seed result files and the final checkpoints are written here.
results_dir = f"ray_results/ppo_lstm_discrete_{test_name}"

# Train one independent PPO+LSTM agent per random seed.
for random_seed in range(100,105):
    # Seed the driver process.
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    trainer_ppo_lstm_disc = PPOTrainer(
        config={
            "env": MultipleParticlesInFlowDiscrete,
            "env_config": env_config,
            "framework": "torch",
            # The calls above only seed this process; pass the seed to RLlib
            # as well so that its rollout workers are seeded too.
            "seed": random_seed,
            "train_batch_size": 2*1024,
            "callbacks": MyCallbacks,
            # Parallel rollouts
            # "num_workers": 8,
            "num_gpus": 1,
            "lr": 0.00005,
            "clip_param": 0.99,
            "log_level": "INFO",
            "model": {
                "use_lstm": True,
                "lstm_cell_size": lstm_cell_size,
                "lstm_use_prev_action": False,
                "lstm_use_prev_reward": False,
                "max_seq_len": 40,
                "fcnet_hiddens": [256, 256],
            },
        }
    )

    train_reward_hist_lstm_disc = []
    train_downwash_hist_lstm_disc = []
    train_actuation_hist_lstm_disc = []
    train_episodes_total_lstm_disc = []

    for i in range(400):
        results = trainer_ppo_lstm_disc.train()
        reward_mean = results['episode_reward_mean']
        # These two keys are produced from the metrics MyCallbacks records.
        downwash_mean = results['custom_metrics']['reward_downwash_mean']
        actuation_mean = results['custom_metrics']['reward_actuation_mean']

        train_reward_hist_lstm_disc.append(reward_mean)
        train_downwash_hist_lstm_disc.append(downwash_mean)
        train_actuation_hist_lstm_disc.append(actuation_mean)
        train_episodes_total_lstm_disc.append(results['episodes_total'])
        print(f"i={i}; reward={reward_mean}; downwash={downwash_mean}; actuation={actuation_mean}; episodes={results['episodes_this_iter']}; total episodes={results['episodes_total']}")
        # Intermediate checkpoint every 10th iteration, into the trainer's log dir.
        if i % 10 == 9:
            trainer_ppo_lstm_disc.save_checkpoint(trainer_ppo_lstm_disc.logdir)

    # Save results to file
    train_lstm_disc_results = {
        "train_reward": train_reward_hist_lstm_disc,
        "train_downwash": train_downwash_hist_lstm_disc,
        "train_actuation": train_actuation_hist_lstm_disc,
        "train_episodes": train_episodes_total_lstm_disc
    }

    filename = f"{results_dir}/train_results_{random_seed}.pkl"
    os.makedirs(results_dir, exist_ok=True)
    with open(filename, 'wb') as f:
        pickle.dump(train_lstm_disc_results, f)
    # NOTE(review): every seed saves its final checkpoint into this same
    # directory; if the checkpoint file name depends only on the training
    # iteration, later seeds overwrite earlier ones. Confirm, and consider a
    # per-seed sub-directory.
    trainer_ppo_lstm_disc.save_checkpoint(results_dir)
    
    plot_results(train_episodes_total_lstm_disc, train_reward_hist_lstm_disc, train_downwash_hist_lstm_disc, train_actuation_hist_lstm_disc)
    