In [1]:
!pip install svgpath2mpl

[0m

In [2]:
from collections import namedtuple
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adamax
import random
import math 
from svgpath2mpl import parse_path
import matplotlib.pyplot as plt
import matplotlib
from scipy.ndimage import rotate, shift
from matplotlib.animation import FuncAnimation
from probabilistic_fire_env import ProbabilisticFireEnv
from drone_env import DronesEnv
from networks.ppo_net import PPONet
from torch.distributions import MultivariateNormal, Categorical

In [3]:
DT          = 0.5  # Time between wildfire updates            
DTI         = 0.1  # Time between aircraft decisions
n_actions = 2
height = width = 100
channels = 2
EPISODES_PER_BATCH = 1
TRAIN_FREQ  = 10
SAVE_FREQ = 10
GAMMA = 0.95
CLIP  = 0.2
BATCH_SIZE = 64

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

actor = PPONet(device,  channels, height, width, n_actions).to(device)
critic = PPONet(device, channels, height, width, 1).to(device)

optimizer_actor = torch.optim.Adam(params=actor.parameters(), lr=0.001)
optimizer_critic = torch.optim.Adam(params=critic.parameters(), lr=0.001)

cov_var = torch.full(size=(n_actions,), fill_value=0.5, device=device)
cov_mat = torch.diag(cov_var)

In [5]:
def get_action(belief_map, state_vector, hidden):
    action_probs, new_hidden = actor(belief_map, state_vector, hidden)
    dist = Categorical(action_probs)
    action = dist.sample()
    log_prob = dist.log_prob(action)
    return action, log_prob, hidden

In [6]:
Transition = namedtuple('Transition', ('belief_map', 'state_vector', 'action', 'log_probability' ,'reward'))

class EpisodeTransitions:

    def __init__(self):
        self._transitions = []

    def push(self, *args):
        self._transitions.append(Transition(*args))

    def __getitem__(self, index):
        self._transitions[index]

    @property 
    def transitions(self):
        return self._transitions

    def __len__(self):
        return len(self._transitions)+1

class EpisodeMemoryBuffer:

    def __init__(self, capacity=10):
        self.capicity = capacity
        self.memory = []
        
    def push(self, episodeTransitions):
        self.memory.append(episodeTransitions)
    
    @property
    def get_batches(self):
        batches = []
        for episode in self.memory:
            batches.extend([episode.transitions[ i : i + BATCH_SIZE] for i in range(0, len(episode.transitions), BATCH_SIZE)])
            if len(batches[-1]) < BATCH_SIZE:
                batches.pop()
        random.shuffle(batches)
        return batches

    def __len__(self):
        return sum([len(episode) for episode in self.memory])


def compute_reward_to_go(rewards):
    _reward_to_go = []

    discounted_reward = 0

    for reward in reversed(rewards):
        discounted_reward = float(reward.item()) + discounted_reward * GAMMA
        _reward_to_go.insert(0, discounted_reward)
            
    return torch.tensor(_reward_to_go, device=device, dtype=torch.float)





In [7]:
fireEnv = ProbabilisticFireEnv(height, width)
dronesEnv = DronesEnv(height, width, DT, DTI) 

def rollout():

    memory_buffer = EpisodeMemoryBuffer()

    episode_transitions_1 = EpisodeTransitions()
    episode_transitions_2 = EpisodeTransitions()

    episode_i = 0

    observation = fireEnv.reset()
    dronesEnv.reset(observation)

    episode_length = 0 
    hidden_1 = None
    hidden_2 = None
    while episode_i < EPISODES_PER_BATCH:

        for j in range(TRAIN_FREQ//int(2*DT/DTI)):

            observation = fireEnv.step()

            state_vector_1 = dronesEnv.drones[0].state
            map_1 = dronesEnv.drones[0].observation
            state_vector_1 = torch.tensor(state_vector_1, device=device, dtype=torch.float)
            map_1 = torch.tensor(map_1, device=device, dtype=torch.float)

            state_vector_2 = dronesEnv.drones[1].state
            map_2 = dronesEnv.drones[1].observation
            state_vector_2 = torch.tensor(state_vector_2, device=device, dtype=torch.float)
            map_2 = torch.tensor(map_2, device=device, dtype=torch.float)


            for i in range(int(DT/DTI)):

                episode_length += 1
                action1, log_probability_1, hidden_1 = get_action(map_1, state_vector_1, hidden_1)
                action2, log_probability_2, hidden_2 = get_action(map_2, state_vector_2, hidden_2)
                reward_1, reward_2 = dronesEnv.step([action1.item(), action2.item()], observation)

                next_state_vector_1 = dronesEnv.drones[0].state
                next_map_1 = dronesEnv.drones[0].observation

                next_state_vector_1 = torch.tensor(next_state_vector_1, device=device, dtype=torch.float)
                next_map_1 = torch.tensor(next_map_1, device=device, dtype=torch.float)

                next_state_vector_2 = dronesEnv.drones[1].state
                next_map_2 = dronesEnv.drones[1].observation

                next_state_vector_2 = torch.tensor(next_state_vector_2, device=device, dtype=torch.float)
                next_map_2 = torch.tensor(next_map_2, device=device, dtype=torch.float)

                reward_1 = torch.tensor([reward_1], device=device)
                reward_2 = torch.tensor([reward_2], device=device)  

                episode_transitions_1.push(map_1, state_vector_1, action1, log_probability_1, reward_1)
                episode_transitions_2.push(map_2, state_vector_2, action2, log_probability_2, reward_2)

                state_vector_1 = next_state_vector_1
                state_vector_2 = next_state_vector_2

                map_1 = next_map_1
                map_2 = next_map_2

            if not fireEnv.fire_in_range(6):
                hidden_1 = None
                hidden_2 = None
                memory_buffer.push(episode_transitions_1)
                memory_buffer.push(episode_transitions_2)
                episode_transitions_1 = EpisodeTransitions()
                episode_transitions_2 = EpisodeTransitions()
                episode_i += 1
                observation = fireEnv.reset()
                dronesEnv.reset(observation)
            
    return memory_buffer



In [8]:
def evaluate(belief_maps, state_vectors, actions):
    action_probs, _ = actor(belief_maps, state_vectors)
    dist = Categorical(action_probs)
    action_logprobs = dist.log_prob(actions)
    dist_entropy = dist.entropy()
    state_values, _ = critic(belief_maps, state_vectors)
    
    return action_logprobs, state_values, dist_entropy

In [9]:
def learn(total_timesteps):
    t_so_far = 0 # Timesteps simulated so far
    i_so_far = 0 # Iterations ran so far

    while t_so_far < total_timesteps:  
        memory_buffer = rollout()
        batches = memory_buffer.get_batches

        for batch in batches[0:1]:

            t_so_far += BATCH_SIZE
            i_so_far += 1

            batch = Transition(*zip(*batch))

            belief_map_batch = torch.cat(batch.belief_map)
            state_vector_batch = torch.cat(batch.state_vector)
            action_batch = torch.cat(batch.action)
            log_probs_batch = torch.cat(batch.log_probability)
            
            reward_to_go = compute_reward_to_go(batch.reward)
            reward_to_go = (reward_to_go - reward_to_go.mean()) / (reward_to_go.std() + 1e-7)
            
            
            for _ in range(5):   
                
                
                logprobs, state_values, dist_entropy = evaluate(belief_map_batch , state_vector_batch, action_batch)
                state_values = torch.squeeze(state_values)
                ratios = torch.exp(logprobs - log_probs_batch)
                advantages = reward_to_go - state_values.detach() 

                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1-CLIP, 1+CLIP) * advantages

                actor_loss = (-torch.min(surr1, surr2)).mean()
                critic_loss = nn.MSELoss()(state_values, reward_to_go)
      
				# Calculate gradients and perform backward propagation for actor network
                optimizer_actor.zero_grad()
                actor_loss.backward(retain_graph=True)
                optimizer_actor.step()

				# Calculate gradients and perform backward propagation for critic network
                optimizer_critic.zero_grad()
                critic_loss.backward()
                optimizer_critic.step()


                print("one step")

            print(f'Timesteps: {t_so_far}')
            if i_so_far % SAVE_FREQ == 0:
                torch.save(actor.state_dict(), './ppo_actor.pth')
                torch.save(critic.state_dict(), './ppo_critic.pth')

        

In [10]:
learn(200_000_000)

one step


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [200, 2]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).