In [1]:
!pip install svgpath2mpl

[0m

In [2]:
from collections import namedtuple, deque
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adamax
import random
from svgpath2mpl import parse_path
import matplotlib.pyplot as plt
import matplotlib
from scipy.ndimage import rotate, shift
from matplotlib.animation import FuncAnimation
from probabilistic_fire_env import ProbabilisticFireEnv
from drone_env import DronesEnv
from replay_memory import Transition
from models.drqn import DRQN

In [3]:
from collections import deque
import random
import numpy as np
import torch
from transition import Transition

class EpisodeBuffer:
  """Bounded FIFO store of finished episodes, sampled in batches for training.

  Each stored episode must support ``len()`` and slicing and yield
  ``Transition`` tuples (``EpisodeMemory`` satisfies this).  Once ``capacity``
  episodes are held, pushing a new one evicts the oldest.
  """

  def __init__(self, capacity=1000):
    """Create an empty buffer holding at most ``capacity`` episodes."""
    self.capacity = capacity
    # Original (misspelled) attribute name, kept for backward compatibility.
    self.capicity = capacity
    self.episodes = deque([], maxlen=capacity)

  def push(self, episode):
    """Append a finished episode, evicting the oldest one when full."""
    # deque has no `push`; `append` is the FIFO insert honoured by `maxlen`.
    self.episodes.append(episode)

  # The training loop in this notebook calls `put`; accept both spellings.
  put = push

  def sample(self, batch_size):
    """Draw ``batch_size`` distinct episodes and merge them into one batch.

    Every sampled episode is truncated to the length of the shortest one, so
    each contributes the same number of transitions.

    Returns:
      A ``(batch, min_episode_length)`` pair.  ``batch`` is a ``Transition``
      whose fields are tuples of per-transition values, ordered episode by
      episode (all kept steps of the first sampled episode, then the second,
      and so on).

    Raises:
      ValueError: if fewer than ``batch_size`` episodes are stored, or if the
        shortest sampled episode is empty.
    """
    episode_batch = random.sample(list(self.episodes), batch_size)
    min_episode_length = min(len(episode) for episode in episode_batch)
    if min_episode_length == 0:
      raise ValueError("cannot build a batch from an empty episode")

    # Flatten to a single list of transitions before transposing; zipping the
    # episodes themselves would group by time step instead of by field.
    transitions = [
      transition
      for episode in episode_batch
      for transition in episode[0:min_episode_length]
    ]
    return Transition(*zip(*transitions)), min_episode_length

  def __len__(self):
    """Number of episodes currently stored."""
    return len(self.episodes)


class EpisodeMemory:
  """Ordered record of the transitions collected during a single episode."""

  def __init__(self):
    """Start with no recorded transitions."""
    self.memory = list()

  def push(self, *args):
    """Pack the positional fields into a ``Transition`` and store it last."""
    transition = Transition(*args)
    self.memory.append(transition)

  def __len__(self):
    """Number of transitions recorded so far."""
    return len(self.memory)

  def __getitem__(self, index):
    """Return the transition (or, for a slice, the list of transitions) at ``index``."""
    return self.memory[index]

In [4]:
# Grid dimensions handed to the fire and drone environments.
height = 100
width = 100

# Training hyper-parameters.
BATCH_SIZE = 5         # Episodes drawn from the episode buffer per optimisation step
GAMMA = 0.99           # Discount applied to the target network's next-state value
INIT_SIZE = 5          # Episode-counter threshold before optimisation starts
TARGET_UPDATE = 1000   # Optimisation steps between target-network syncs/checkpoints
SAVE_POLICY = 100
EPISODE_LENGTH = 250
TRAIN_FREQ = 10        # Number of samples to generate between trainings (should be a multiple of 10)
PRINT_FREQ = 100       # Frequency of printing (should be a multiple of 10)

In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Network input/output sizes.
n_actions = 2
screen_height = 100
screen_width = 100
channels = 2

# Two DRQN instances built with identical arguments: the policy network is
# optimised, the target network is overwritten with its weights below.
policy_net = DRQN(device, channels, screen_height, screen_width, n_actions).to(device)
target_net = DRQN(device, channels, screen_height, screen_width, n_actions).to(device)

# Running count of action selections, passed to select_action by the training loop.
steps = 0

policy_file_path = './policy_weights.pt'
target_file_path = './target_weights.pt'

# To resume from a checkpoint, uncomment:
#policy_net.load_state_dict(torch.load(policy_file_path))
#target_net.load_state_dict(torch.load('target_weights.pt'))

# Start the target network as an exact copy of the policy network.
target_net.load_state_dict(policy_net.state_dict())

# Replay storage of whole episodes (default capacity).
episode_buffer = EpisodeBuffer()

#policy_net.train()
#target_net.eval()

# Number of optimize_model() calls so far; drives the target-sync schedule.
update_counter = 0
optimizer = Adamax(policy_net.parameters(), lr=0.0001)

In [6]:
def optimize_model():
    """Run one optimisation step of `policy_net` on a batch of sampled episodes.

    Samples BATCH_SIZE episodes from `episode_buffer`, computes the one-step
    TD target ``reward + GAMMA * max_a Q_target(next_state, a)`` and minimises
    the Huber loss between it and ``Q_policy(state, action)``.  Every
    TARGET_UPDATE calls, both networks are checkpointed to disk and the target
    network is overwritten with the policy network's weights.

    Returns:
        The scalar loss tensor of this step.
    """
    global update_counter
    update_counter += 1

    # sample() returns (batch, min_episode_length); only the batch is used here.
    batch, _ = episode_buffer.sample(BATCH_SIZE)

    belief_map_batch = torch.cat(batch.belief_map)
    state_vector_batch = torch.cat(batch.state_vector)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    next_belief_map = torch.cat(batch.next_belief_map)
    next_states = torch.cat(batch.next_state_vector)

    # Q-value of the action that was actually taken in each transition.
    state_action_values = policy_net(belief_map_batch, state_vector_batch).gather(1, action_batch)

    # The bootstrap target must not receive gradients.
    with torch.no_grad():
        next_state_values = target_net(next_belief_map, next_states).max(1)[0]

    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Clamp every gradient element to [-1, 1]; unlike a manual loop over
    # `param.grad`, this skips parameters that received no gradient.
    nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

    if update_counter % TARGET_UPDATE == 0:
        # Distinct local names so the module-level checkpoint paths are not shadowed.
        policy_checkpoint_path = './policy_weights2.pt'
        target_checkpoint_path = './target_weights2.pt'
        # The target weights are saved before the sync, i.e. the checkpoint
        # holds the target network that was used for the preceding updates.
        torch.save(policy_net.state_dict(), policy_checkpoint_path)
        torch.save(target_net.state_dict(), target_checkpoint_path)
        print('update target')
        target_net.load_state_dict(policy_net.state_dict())

    return loss

In [7]:
DT          = 0.5  # Time between wildfire updates
DTI         = 0.1  # Time between aircraft decisions
fireEnv = ProbabilisticFireEnv(height, width)
dronesEnv = DronesEnv(height, width, DT, DTI)
loss = None
i_episode = 0  # Number of completed episodes

# round() rather than int(): these are float ratios, and truncation silently
# drops a step whenever the quotient lands just below an integer (0.3 / 0.1).
decisions_per_fire_step = round(DT / DTI)
fire_steps_per_train = TRAIN_FREQ // round(2 * DT / DTI)


def _as_tensor(array):
  """Copy an environment array onto `device` as a float tensor."""
  return torch.tensor(array, device=device, dtype=torch.float)


observation = fireEnv.reset()
dronesEnv.reset(observation)

# One transition log and one recurrent hidden state per drone.
episode_memory_1 = EpisodeMemory()
episode_memory_2 = EpisodeMemory()

hidden_1 = None
hidden_2 = None

# Runs until the kernel is interrupted.
while True:
  for _ in range(fire_steps_per_train):

    observation = fireEnv.step()

    # Current inputs of both drones, converted once so that the stored
    # "state" and "next state" fields are always tensors of the same kind.
    state_vector_1 = _as_tensor(dronesEnv.drones[0].state)
    belief_map_1 = _as_tensor(dronesEnv.drones[0].observation)

    state_vector_2 = _as_tensor(dronesEnv.drones[1].state)
    belief_map_2 = _as_tensor(dronesEnv.drones[1].observation)

    for _ in range(decisions_per_fire_step):

      action1, hidden_1 = policy_net.select_action(belief_map_1, state_vector_1, steps, hidden_1)
      action2, hidden_2 = policy_net.select_action(belief_map_2, state_vector_2, steps, hidden_2)

      steps += 2  # one action selection per drone
      reward_1, reward_2 = dronesEnv.step([action1.item(), action2.item()], observation)

      next_state_vector_1 = _as_tensor(dronesEnv.drones[0].state)
      next_map_1 = _as_tensor(dronesEnv.drones[0].observation)

      next_state_vector_2 = _as_tensor(dronesEnv.drones[1].state)
      next_map_2 = _as_tensor(dronesEnv.drones[1].observation)

      reward_1 = torch.tensor([reward_1], device=device, dtype=torch.float)
      reward_2 = torch.tensor([reward_2], device=device, dtype=torch.float)

      episode_memory_1.push(belief_map_1, state_vector_1, action1, next_map_1, next_state_vector_1, reward_1)
      episode_memory_2.push(belief_map_2, state_vector_2, action2, next_map_2, next_state_vector_2, reward_2)

      # The successor becomes the current input; it is already a tensor, so
      # no second torch.tensor() copy is needed.
      state_vector_1, belief_map_1 = next_state_vector_1, next_map_1
      state_vector_2, belief_map_2 = next_state_vector_2, next_map_2

    # Episode ends once fire_in_range(6) is false
    # (presumably: no fire left within range 6 — confirm against the env).
    if not fireEnv.fire_in_range(6):
      observation = fireEnv.reset()
      dronesEnv.reset(observation)

      # Hand the finished episodes to the replay buffer, then start fresh
      # logs so the next episode does not append to the stored ones.
      episode_buffer.push(episode_memory_1)
      episode_buffer.push(episode_memory_2)
      episode_memory_1 = EpisodeMemory()
      episode_memory_2 = EpisodeMemory()

      # Recurrent state must not leak across episodes.
      hidden_1 = None
      hidden_2 = None
      i_episode += 1

      if i_episode % 5 == 0:
        print(f'{i_episode} episodes completed')
        print(f'loss {loss}')
        print(f'steps done {steps}')

  # Start optimising only after INIT_SIZE episodes have been collected.
  if i_episode >= INIT_SIZE:
    loss = optimize_model()

NameError: name 'hidden_1' is not defined