### Imports

In [18]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import namedtuple, deque
import random
import math
from itertools import count

### Hyperparameters

In [21]:
# DQL constants
BUFFER_SIZE = 100_000        # capacity handed to the replay memory
BATCH_SIZE = 128             # transitions sampled per optimisation step
GAMMA = 0.999                # discount applied to next-state values
EPS_START = 0.95             # epsilon-greedy threshold at time step 0
EPS_END = 0.05               # value the threshold decays towards
EPS_DECAY = 200              # time constant of the exponential decay
TARGET_UPDATE_EVERY = 10     # episodes between target-network syncs
LEARNING_RATE = 5e-4         # learning rate given to the Adam optimizer
NUM_EPISODES = 20            # episodes run by the training loop

# Environment constants
ENV_SIZE = 32                # the grid is ENV_SIZE x ENV_SIZE cells
MAX_ENV_STEPS = 50           # an episode is done after this many steps
POP_DENSITY = 0.2            # probability that a cell is populated on reset
ZOMBIE_FRACTION = 0.3        # share of the populated cells that are zombies
VISIBILITY = 4               # square's half side length
STATE_SIZE = (VISIBILITY * 2 + 1) ** 2
ACTION_SIZE = 5              # 4 directions + do nothing

### Initializations

In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


## Function definitions

### Environment

In [30]:
# TODO: code the environment
class Environment():
    """Grid world in which one agent moves among humans and zombies.

    The world is an ``ENV_SIZE x ENV_SIZE`` numpy array indexed as
    ``grid[y, x]``; each cell holds one of the ``*_CELL`` codes below.
    Agent movement wraps around the grid edges.

    ``scan_env``, ``render`` and ``move_zombies`` are still stubs.
    """

    # Rewards accumulated by ``step``.
    DEATH_REWARD = -100
    KILL_REWARD = 50
    REST_REWARD = 0
    MOVE_REWARD = -5

    # Cell codes stored in the grid.
    EMPTY_CELL = 0
    HUMAN_CELL = 1
    AGENT_CELL = 2
    ZOMBIE_CELL = 3

    def __init__(self, state_size):
        """Store the state size; the grid itself is created by ``reset``."""
        self.state_size = state_size

    @property
    def agent_pos(self):
        """Agent position as ``(y, x)``, i.e. in grid indexing order."""
        return (self.agent_y, self.agent_x)

    def reset(self):
        """Build a fresh random grid, place the agent and return the first state.

        Each cell is populated with probability ``POP_DENSITY``; a populated
        cell is a zombie with probability ``ZOMBIE_FRACTION`` and a human
        otherwise.

        Raises:
            RuntimeError: if the generated grid has no empty cell for the agent.
        """
        ## generate grid
        self.grid = np.zeros((ENV_SIZE, ENV_SIZE))

        ## place people and zombies
        zombie_threshold = POP_DENSITY * ZOMBIE_FRACTION
        for y in range(ENV_SIZE):
            for x in range(ENV_SIZE):
                draw = random.uniform(0, 1)
                if draw < zombie_threshold:
                    self.grid[y, x] = self.ZOMBIE_CELL
                elif draw < POP_DENSITY:
                    self.grid[y, x] = self.HUMAN_CELL

        # place agent on a uniformly chosen empty cell; picking from the list
        # of empty cells cannot loop forever on a crowded grid
        empty_cells = np.argwhere(self.grid == self.EMPTY_CELL)
        if len(empty_cells) == 0:
            raise RuntimeError("no empty cell left to place the agent")
        y, x = empty_cells[random.randrange(len(empty_cells))]
        self.agent_y, self.agent_x = int(y), int(x)
        self.grid[self.agent_y, self.agent_x] = self.AGENT_CELL

        self.episode_step = 0
        state, _, _ = self.scan_env(self.agent_pos)
        return state

    def step(self, action):
        """Apply ``action``, resolve fights and return the transition.

        Args:
            action: 0 to rest, 1-4 to move (see ``perform_action``).

        Returns:
            ``(state, reward, done, None)``. ``done`` is True once the agent
            died or ``MAX_ENV_STEPS`` steps have been taken this episode.
        """
        self.episode_step += 1
        self.perform_action(action)

        ## TODO - Later on: make other agents move as well

        self.move_zombies()

        state, nb_allies, enemies = self.scan_env(self.agent_pos)

        dead = False
        reward = self.REST_REWARD if action == 0 else self.MOVE_REWARD

        # TODO: do this same thing but for all other interactions
        # Each reported enemy is fought in turn. Might want to aggregate this
        # by simply modifying the 'b' value instead (b = 1 + nb_enemies).
        for enemy in enemies:
            r = np.random.uniform(0, 1)
            b = 1
            # maybe add some multiplicative factor to put more importance on
            # having allies
            k = 1 + nb_allies
            if r >= b / (b + k):  # kill, with probability k / (b + k)
                reward += self.KILL_REWARD
                # tuple() makes this a single-cell index even if the
                # coordinate arrives as a list or array
                self.grid[tuple(enemy)] = self.EMPTY_CELL
            else:  # death
                reward += self.DEATH_REWARD
                dead = True
                ## Probably don't need to update state as it won't be used in
                ## replay memory anyway
                break

        done = self.episode_step >= MAX_ENV_STEPS or dead

        return state, reward, done, None

    def scan_env(self, agent_pos):
        """Scan the vicinity to record allies and enemies (this is our new state).

        Still a stub: it returns an empty state, no allies and no enemies.

        Args:
            agent_pos: ``(y, x)`` position to scan around (unused by the stub).

        Returns:
            ``(state, nb_allies, enemies)`` where ``enemies`` is a list of
            ``(y, x)`` grid coordinates, which is what ``step`` iterates over
            and clears from the grid.
        """
        # TODO
        state = np.array([])
        nb_allies = 0
        enemies = []
        return state, nb_allies, enemies

    def render(self):
        """Not implemented yet."""
        pass

    def perform_action(self, action):
        """Move the agent one cell, wrapping around the grid edges.

        Actions: 1 -> x + 1, 2 -> x - 1, 3 -> y + 1, 4 -> y - 1; any other
        value leaves the agent in place. A move onto a non-empty cell is
        cancelled.
        """
        old_x, old_y = self.agent_x, self.agent_y
        new_x, new_y = old_x, old_y
        if action == 1:  # move right
            new_x = (old_x + 1) % ENV_SIZE
        elif action == 2:  # move left
            new_x = (old_x - 1) % ENV_SIZE
        elif action == 3:  # move top
            new_y = (old_y + 1) % ENV_SIZE
        elif action == 4:  # move down
            new_y = (old_y - 1) % ENV_SIZE

        # Might want to implement collision avoidance, but this might cause
        # problems if 2 agents want to move together in the same direction.
        if self.grid[new_y, new_x] != self.EMPTY_CELL:
            # Target occupied (this is also the case when resting, since the
            # agent's own cell is not empty): stay where we are.
            return

        self.grid[old_y, old_x] = self.EMPTY_CELL
        self.grid[new_y, new_x] = self.AGENT_CELL
        self.agent_x, self.agent_y = new_x, new_y

    def move_zombies(self):
        """Not implemented yet."""
        pass

### Replay memory

In [10]:
# One recorded environment transition.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory:
    """Bounded FIFO store of ``Transition`` records."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry when a new one is
        # appended to a full buffer.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a ``Transition`` from ``args`` and append it to the buffer."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

### DQN

In [14]:
class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.seed = torch.manual_seed(0)
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 32)
        self.head = nn.Linear(32, action_size)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # return self.head(x)
        return self.head(x.view(x.size(0), -1))

### DQL Agent

In [28]:
class DQLAgent():
    """Deep Q-learning agent: policy network, target network and replay memory."""

    def __init__(self, state_size, action_size, env):
        """Create both networks, the optimizer and an empty replay memory.

        Args:
            state_size: number of input features of the Q-networks.
            action_size: number of discrete actions.
            env: environment providing ``reset()`` and ``step(action)``.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.env = env

        # DQNs: the target network starts as a copy of the policy network and
        # is only ever updated by copying weights, hence eval mode.
        self.policy_net = DQN(state_size, action_size).to(device)
        self.target_net = DQN(state_size, action_size).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LEARNING_RATE)
        self.memory = ReplayMemory(BUFFER_SIZE)

        # Number of actions selected so far; drives the epsilon decay.
        self.time_step = 0

    def _to_tensor(self, state):
        """Convert an environment observation into a ``(1, n)`` float tensor."""
        return torch.as_tensor(state, dtype=torch.float32, device=device).reshape(1, -1)

    def optimize_model(self):
        """Run one optimisation step on a random batch from the replay memory.

        Does nothing until the memory holds at least ``BATCH_SIZE`` transitions.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)
        # Turn a list of Transitions into one Transition of tuples.
        batch = Transition(*zip(*transitions))

        # Terminal transitions are stored with next_state=None.
        non_final_mask = torch.tensor(
            [s is not None for s in batch.next_state],
            device=device, dtype=torch.bool)
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Q(s, a) for the actions that were actually taken.
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # max_a' Q_target(s', a'); stays 0 for terminal transitions.
        next_state_values = torch.zeros(BATCH_SIZE, device=device)
        if non_final_mask.any():  # torch.cat would raise on an empty list
            non_final_next_states = torch.cat(
                [s for s in batch.next_state if s is not None])
            with torch.no_grad():
                next_state_values[non_final_mask] = (
                    self.target_net(non_final_next_states).max(1)[0])
        expected_state_action_values = next_state_values * GAMMA + reward_batch

        criterion = nn.SmoothL1Loss()
        loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        # Clamp every gradient element to [-1, 1]; parameters without a
        # gradient are skipped.
        nn.utils.clip_grad_value_(self.policy_net.parameters(), 1)
        self.optimizer.step()

    def select_action(self, state):
        """Pick an action epsilon-greedily and advance the decay counter.

        Args:
            state: ``(1, state_size)`` tensor.

        Returns:
            ``(1, 1)`` long tensor holding the chosen action index.
        """
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * self.time_step / EPS_DECAY)
        # Without this increment epsilon would stay at EPS_START forever.
        self.time_step += 1
        if sample > eps_threshold:
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)
        return torch.tensor([[random.randrange(self.action_size)]], device=device, dtype=torch.long)

    def train(self):
        """Run ``NUM_EPISODES`` episodes, learning after every environment step."""
        for episode in range(NUM_EPISODES):
            state = self._to_tensor(self.env.reset())  # get initial state
            # The environment is responsible for returning done=True after
            # some time steps.
            for _ in count():
                action = self.select_action(state)
                observation, reward, done, _info = self.env.step(action.item())
                reward = torch.tensor([reward], device=device, dtype=torch.float32)

                # A finished episode has no successor state; None is what
                # optimize_model uses to mask out the bootstrap term.
                next_state = None if done else self._to_tensor(observation)

                self.memory.push(state, action, next_state, reward)
                state = next_state

                self.optimize_model()
                if done:
                    # TODO: Plot some statistics etc...
                    break
            if episode % TARGET_UPDATE_EVERY == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
        print("Training finished")
        

# Run simulation

In [29]:
# Build the environment and the agent that will act in it.
env = Environment(state_size=STATE_SIZE)
dqlAgent = DQLAgent(
    state_size=STATE_SIZE,
    action_size=ACTION_SIZE,
    env=env,
)
#dqlAgent.train()