In [1]:
!pip install svgpath2mpl

Collecting svgpath2mpl
  Downloading svgpath2mpl-1.0.0-py2.py3-none-any.whl (7.8 kB)
Installing collected packages: svgpath2mpl
Successfully installed svgpath2mpl-1.0.0
[0m

In [2]:
from collections import namedtuple, deque
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adamax
import random
from svgpath2mpl import parse_path
import matplotlib.pyplot as plt
import matplotlib
from scipy.ndimage import rotate, shift
from matplotlib.animation import FuncAnimation
from probabilistic_fire_env import ProbabilisticFireEnv
from drone_env import DronesEnv
from replay_memory import Transition
from models.drqn import DRQN

In [3]:
from collections import deque
import random
import numpy as np
import torch
from transition import Transition

class EpisodeBuffer:


  def __init__(self, capacity=100):
    self.capicity = capacity
    self.episodes = deque([],maxlen=self.capicity)

  def push(self, episode):
    self.episodes.append(episode)

  def sample(self, batch_size, episode_start, episode_end):
    

    episode_batch =  random.sample(self.episodes, batch_size)
    episode_end   =  min(episode_end, len(episode_batch[0])) 

    return episode_batch[0][episode_start:episode_end], episode_end-episode_start

  def __len__(self):
      return len(self.memory)


class EpisodeMemory:

  def __init__(self):
    self.memory = []

  def push(self, *args):
    self.memory.append(Transition(*args))

  def __getitem__(self, index):
    return self.memory[index]
    
  def __len__(self):
    return len(self.memory)

In [4]:
height = width = 100
BATCH_SIZE = 1
GAMMA = 0.99
INIT_SIZE = 3
TARGET_UPDATE = 1000
SAVE_POLICY = 100
EPISODE_LENGTH = 250
TRAIN_FREQ  = 10   # Number of samples to generate between trainings (Should be multiple of 10)
PRINT_FREQ  = 100  # Frequency of printing (Should be a multiple of 10)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_actions = 2
screen_height = screen_width = 100
channels = 2
policy_net = DRQN(device, channels, screen_height, screen_width, n_actions).to(device)
target_net = DRQN(device, channels, screen_height, screen_width, n_actions).to(device)
steps = 0


#policy_net.load_state_dict(torch.load(policy_file_path))
#target_net.load_state_dict(torch.load('target_weights.pt'))
target_net.load_state_dict(policy_net.state_dict())
episode_buffer = EpisodeBuffer()

#policy_net.train()
#target_net.eval()
update_counter = 0
optimizer = Adamax(policy_net.parameters(), lr=0.0001)

In [6]:
def optimize_model():
    
    global update_counter
    update_counter += 1

    episode_start = random.randint(0, 50)
    episode_end  =  episode_start+64

    transitions, epiosde_length = episode_buffer.sample(BATCH_SIZE, episode_start, episode_end)
    batch = Transition(*zip(*transitions))
    
    next_states = torch.cat([torch.tensor(next_state_vector, dtype=torch.float).to(device) for next_state_vector in batch.next_state_vector])
    next_belief_map = torch.cat([torch.tensor(next_belief_map, dtype=torch.float).to(device) for next_belief_map in batch.next_belief_map])

    belief_map_batch = torch.cat([torch.tensor(belief_map, dtype=torch.float).to(device) for belief_map in batch.belief_map])
    state_vector_batch  = torch.cat([torch.tensor(state_vector, dtype=torch.float).to(device) for state_vector in batch.state_vector])
    
    action_batch  = torch.stack([torch.tensor([action], dtype=torch.long).to(device) for action in batch.action], axis=0)

    reward_batch  = torch.stack([torch.tensor(reward, dtype=torch.long).to(device) for reward in batch.reward], axis=0)

    hidden_policy = policy_net.init_hidden_state(epiosde_length, training=True)
    hidden_target = target_net.init_hidden_state(epiosde_length, training=True)

    policy_output, _ = policy_net(belief_map_batch, state_vector_batch, hidden_policy)
    target_output, _ = target_net(next_belief_map, next_states, hidden_target)

    state_action_values = policy_output.gather(1, action_batch)
    next_state_values = target_output.max(1)[0].detach()

    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss().to(device)
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

    if update_counter % TARGET_UPDATE == 0:
        policy_file_path = f'./policy_weights_drqn.pt'
        target_file_path = f'./target_weights_drqn.pt'
        torch.save(policy_net.state_dict(), policy_file_path)
        torch.save(target_net.state_dict(), target_file_path)
        print('update target')
        target_net.load_state_dict(policy_net.state_dict())

    return loss

In [7]:
DT          = 0.5  # Time between wildfire updates            
DTI         = 0.1  # Time between aircraft decisions
fireEnv = ProbabilisticFireEnv(height, width)
dronesEnv = DronesEnv(height, width, DT, DTI) 
loss = None
i_episode = 1

observation = fireEnv.reset()
dronesEnv.reset(observation)

episode_memory_1 = EpisodeMemory()
episode_memory_2 = EpisodeMemory()

hidden_1 = policy_net.init_hidden_state()
hidden_2 = policy_net.init_hidden_state()

while True:
  # Initialize the environment and state
  #env.reset()
  for j in range(TRAIN_FREQ//int(2*DT/DTI)):

    observation = fireEnv.step()

    state_vector_1 = dronesEnv.drones[0].state
    map_1 = dronesEnv.drones[0].observation

    state_vector_2 = dronesEnv.drones[1].state
    map_2 = dronesEnv.drones[1].observation


    for i in range(int(DT/DTI)):

      state_vector_1_tensor = torch.tensor(state_vector_1, device=device, dtype=torch.float)
      map_1_tensor = torch.tensor(map_1, device=device, dtype=torch.float)

      state_vector_2_tensor = torch.tensor(state_vector_2, device=device, dtype=torch.float)
      map_2_tensor = torch.tensor(map_2, device=device, dtype=torch.float)

      action1, hidden_1 = policy_net.select_action(map_1_tensor, state_vector_1_tensor, steps, hidden_1)
      action2, hidden_2 = policy_net.select_action(map_2_tensor, state_vector_2_tensor, steps, hidden_2)

      steps += 2
      reward_1, reward_2 = dronesEnv.step([action1.item(), action2.item()], observation)

      next_state_vector_1 = dronesEnv.drones[0].state
      next_map_1 = dronesEnv.drones[0].observation

      next_state_vector_2 = dronesEnv.drones[1].state
      next_map_2 = dronesEnv.drones[1].observation


      episode_memory_1.push(map_1, state_vector_1, action1.item(), next_map_1, next_state_vector_1, reward_1)
      episode_memory_2.push(map_2, state_vector_2, action2.item(), next_map_2, next_state_vector_2, reward_2)

      state_vector_1 = next_state_vector_1
      state_vector_2 = next_state_vector_2

      map_1 = next_map_1
      map_2 = next_map_2


    if not fireEnv.fire_in_range(6):
      observation = fireEnv.reset()
      dronesEnv.reset(observation)
      episode_buffer.push(episode_memory_1)
      episode_buffer.push(episode_memory_2)
      episode_buffer_1 = EpisodeMemory()
      episode_buffer_2 = EpisodeMemory()
      hidden_1 = policy_net.init_hidden_state()
      hidden_2 = policy_net.init_hidden_state()
      i_episode +=1
      
      if (i_episode+1) % 5 == 0:
        print(f'{i_episode+1} episodes completed')
        print(f'loss {loss}')
        print(f'steps done {steps}')
      

      if i_episode>=INIT_SIZE:
        loss = optimize_model()

5 episodes completed
loss 0.024106470867991447
steps done 2220
10 episodes completed
loss 3.270687011536211e-05
steps done 6870
15 episodes completed
loss 5.947919362370158e-06
steps done 11550
20 episodes completed
loss 7.611399269080721e-06
steps done 14850
25 episodes completed
loss 2.9829629966116045e-06
steps done 18990
30 episodes completed
loss 6.436221156036481e-06
steps done 22400
35 episodes completed
loss 4.356203589850338e-06
steps done 26560
40 episodes completed
loss 6.3293382481788285e-06
steps done 29920
45 episodes completed
loss 5.882850473426515e-07
steps done 33810
50 episodes completed
loss 3.943794126826106e-06
steps done 37390
55 episodes completed
loss 2.7055327791458694e-06
steps done 40400
60 episodes completed
loss 3.072225922551297e-07
steps done 43720
65 episodes completed
loss 6.672458994216868e-07
steps done 47030
70 episodes completed
loss 2.0482759737205924e-06
steps done 50960
75 episodes completed
loss 2.0478682927205227e-06
steps done 54170
80 episod

: 

: 