https://github.com/jiechuanjiang/pytorch_DGN/blob/main/Surviving/DGN%2BATOC/config.py


In [8]:
import numpy as np
np.random.seed(14)
import math, random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd 
import torch.nn.functional as F
import copy
import os,sys
from torch.utils import tensorboard as tb

In [9]:
# Global hyperparameters and run state shared by the model, environment and training loop below.

# Width passed to MTRL_DGN as its hidden dimension.
hidden_dim = 64
max_step = 5000 # environment steps per episode (originally 500)
# Discount factor applied to the target Q-value in the TD target.
GAMMA = 0.99
n_episode = 1000 # number of training episodes (originally 800)
# Running episode counter; incremented by the training loop.
i_episode = 0
buffer_size = 65000 # replay-buffer capacity, in transitions
batch_size = 64 # transitions sampled per gradient update
n_epoch = 100 # gradient updates performed after each episode (originally 25)
epsilon = 0.7 # probability of taking a random action; decayed by the training loop (originally 0.9)
# Reward accumulated across episodes; the training loop logs and resets it every 20 episodes.
score = 0
# Fraction of the target network's weights retained on each soft update.
tau = 0.98

GRID_DIM = 50 # side length of the grid. TODO: Tune this
NUM_TASKS = 2 # number of task destinations placed in the grid. TODO: Tune this
ADJ_THRESHOLD = GRID_DIM / 4 # distance cut-off; only referenced by the commented-out hard-attention adjacency. TODO: Tune this
USE_CUDA = torch.cuda.is_available()
# Builds an autograd.Variable and moves it to the GPU when CUDA is available.
# NOTE(review): not referenced anywhere else in the visible source.
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)

def is_legal(x, y, grid_dim=None):
    """
    Tell whether the cell (x, y) lies inside the square grid

    Params:
    x: first grid coordinate
    y: second grid coordinate
    grid_dim: side length of the grid; when omitted, the module-level
        GRID_DIM is used

    Returns:
    True when 0 <= x < grid_dim and 0 <= y < grid_dim, False otherwise
    """
    if grid_dim is None:
        grid_dim = GRID_DIM
    # Both axes use the same half-open range [0, grid_dim). The previous
    # `y <= GRID_DIM` test accepted y == GRID_DIM, one cell past the edge.
    # `&` (rather than `and`) is kept from the original expression.
    return (x >= 0) & (x < grid_dim) & (y >= 0) & (y < grid_dim)

https://github.com/jiechuanjiang/pytorch_DGN/blob/main/Surviving/DGN%2BATOC/buffer.py

In [10]:
class ReplayBufferGCare(object):
    """
    Fixed-capacity circular buffer of transitions

    Each slot holds (obs, action, reward, next_obs, matrix, next_matrix, done).
    Once the buffer is full, new transitions overwrite the oldest slots.
    """

    def __init__(self, buffer_size, obs_space, n_action, n_tasks):
        """
        Allocate zero-filled storage for `buffer_size` transitions

        Params:
        buffer_size: number of transitions the buffer can hold
        obs_space: length of the last axis of each stored observation
        n_action: accepted for interface compatibility; not used here
        n_tasks: number of rows per observation and side length of each
            stored (square) matrix
        """
        self.buffer_size = buffer_size
        self.n_tasks = n_tasks
        # `pointer` is the next slot to write; `len` counts the filled slots.
        self.pointer = 0
        self.len = 0

        scalar_shape = (self.buffer_size, 1)
        obs_shape = (self.buffer_size, n_tasks, obs_space)
        matrix_shape = (self.buffer_size, self.n_tasks, self.n_tasks)

        self.actions = np.zeros(scalar_shape, dtype=np.int32)
        self.rewards = np.zeros(scalar_shape)
        self.dones = np.zeros(scalar_shape)
        self.obs = np.zeros(obs_shape)
        self.next_obs = np.zeros(obs_shape)
        self.matrix = np.zeros(matrix_shape)
        self.next_matrix = np.zeros(matrix_shape)

    def getBatch(self, batch_size):
        """
        Draw `batch_size` distinct stored transitions uniformly at random

        Params:
        batch_size: number of transitions to draw; sampling is without
            replacement from the filled part of the buffer

        Returns:
        A tuple (obs, action, reward, next_obs, matrix, next_matrix, done)
        of arrays, each with `batch_size` entries along the first axis
        """
        picked = np.random.choice(self.len, batch_size, replace=False)
        columns = (
            self.obs,
            self.actions,
            self.rewards,
            self.next_obs,
            self.matrix,
            self.next_matrix,
            self.dones,
        )
        return tuple(column[picked] for column in columns)

    def add(self, obs, action, reward, next_obs, matrix, next_matrix, done):
        """
        Store one transition in the slot at `pointer`, then advance it

        Params:
        obs: observation before the action
        action: action taken
        reward: reward received
        next_obs: observation after the action
        matrix: adjacency matrix before the action
        next_matrix: adjacency matrix after the action
        done: termination flag for the transition
        """
        slot = self.pointer
        writes = (
            (self.obs, obs),
            (self.actions, action),
            (self.rewards, reward),
            (self.next_obs, next_obs),
            (self.matrix, matrix),
            (self.next_matrix, next_matrix),
            (self.dones, done),
        )
        for storage, value in writes:
            storage[slot] = value

        # Wrap around at capacity; the fill count saturates at buffer_size.
        self.pointer = (slot + 1) % self.buffer_size
        self.len = min(self.len + 1, self.buffer_size)

https://github.com/jiechuanjiang/pytorch_DGN/blob/main/Surviving/DGN%2BATOC/model.py

In [11]:
class MTRL_ATT(nn.Module):
    """
    Three-layer MLP that maps each input vector to one value in (0, 1)

    Two 64-unit ReLU layers are followed by a single sigmoid output unit.
    The visible training script only references this class in
    commented-out code.
    """
    def __init__(self, din):
        """
        Params:
        din: size of the last axis of the input
        """
        super().__init__()
        self.fc1 = nn.Linear(din, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """
        Params:
        x: tensor whose last axis has size `din`

        Returns:
        Tensor with the same leading axes as `x` and a last axis of size 1
        """
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return torch.sigmoid(self.fc3(hidden))

class MTRL_Encoder(nn.Module): # TODO: Need to make it a CNN for higher dim obs space like MetaWorld
    """
    Single linear layer followed by tanh, used to embed observations
    """
    def __init__(self, din=32, hidden_dim=128):
        """
        Params:
        din: size of the last axis of the input
        hidden_dim: size of the produced embedding
        """
        super().__init__()
        self.fc = nn.Linear(din, hidden_dim)

    def forward(self, x):
        """
        Params:
        x: tensor whose last axis has size `din`

        Returns:
        Embedding with last axis of size `hidden_dim`, squashed by tanh
        """
        return torch.tanh(self.fc(x))

class MTRL_AttModel(nn.Module):
    """
    Masked dot-product attention layer over the rows of a batched input

    Queries, keys and values are tanh-activated linear projections of the
    same input. `fcout` is created but not applied in `forward`, and
    `n_node` is not used.
    """
    def __init__(self, n_node, din, hidden_dim, dout):
        """
        Params:
        n_node: accepted for interface compatibility; not used here
        din: size of the last axis of the input
        hidden_dim: size of the query/key/value projections
        dout: output size of the (currently unapplied) `fcout` layer
        """
        super().__init__()
        self.fcv = nn.Linear(din, hidden_dim)
        self.fck = nn.Linear(din, hidden_dim)
        self.fcq = nn.Linear(din, hidden_dim)
        self.fcout = nn.Linear(hidden_dim, dout)

    def forward(self, x, mask):
        """
        Params:
        x: 3-D tensor (batch, rows, din)
        mask: tensor broadcastable to (batch, rows, rows); entries equal to
            1 leave the score untouched, entries equal to 0 remove the pair

        Returns:
        Tensor (batch, rows, hidden_dim) of attention-weighted values
        """
        values = torch.tanh(self.fcv(x))
        queries = torch.tanh(self.fcq(x))
        keys_t = torch.tanh(self.fck(x)).permute(0, 2, 1)

        scores = torch.bmm(queries, keys_t)
        # Scores are scaled by the mask and penalised by 9e15 * (1 - mask).
        # NOTE(review): with a non-binary mask any entry below 1 also gets a
        # very large negative penalty -- confirm this is intended.
        # Note: Order of applying adj matrix is different than that in paper. Don't get confused!
        masked_scores = scores * mask - 9e15 * (1 - mask)
        weights = torch.softmax(masked_scores, dim=2)
        return torch.bmm(weights, values)

class MTRL_Q_Net(nn.Module):
    """
    Linear Q-value head with a ReLU on its output

    NOTE(review): the ReLU makes every returned Q-value non-negative --
    confirm this is intended.
    """
    def __init__(self, hidden_dim, dout):
        """
        Params:
        hidden_dim: size of ONE attention output; the layer's input size is
            twice this value
        dout: number of Q-values produced per input row
        """
        super().__init__()
        # NOTE: the input is the concatenation of the h vectors from both
        # attention layers, hence hidden_dim * 2. Originally only the last
        # layer's h vector was used and the input size was hidden_dim.
        self.fc = nn.Linear(2 * hidden_dim, dout)

    def forward(self, x):
        """
        Params:
        x: tensor whose last axis has size 2 * hidden_dim

        Returns:
        Tensor with last axis of size `dout`, clipped below at 0 by ReLU
        """
        return torch.relu(self.fc(x))

    
class MTRL_DGN(nn.Module):
    """
    Encoder -> two stacked attention layers -> Q-value head

    The outputs of BOTH attention layers are concatenated along the last
    axis before they are passed to the Q-value head.
    """
    def __init__(self,n_tasks,num_inputs,hidden_dim,num_actions):
        """
        Params:
        n_tasks: forwarded to the attention layers as their node count
        num_inputs: size of the last axis of each observation
        hidden_dim: embedding size used by the encoder and attention layers
        num_actions: number of Q-values produced per row
        """
        super().__init__()

        # TODO: Try both single encoder and mix of encoder settings
        self.encoder = MTRL_Encoder(num_inputs, hidden_dim)
        # The attention layers and the Q net remain the same for MTRL.
        self.att_1 = MTRL_AttModel(n_tasks, hidden_dim, hidden_dim, hidden_dim)
        self.att_2 = MTRL_AttModel(n_tasks, hidden_dim, hidden_dim, hidden_dim)
        self.q_net = MTRL_Q_Net(hidden_dim, num_actions)

    def forward(self, x, mask):
        """
        Params:
        x: batched observations, passed to the encoder
        mask: adjacency/mask tensor handed to both attention layers

        Returns:
        Q-values produced by the head for every row of the input
        """
        embedded = self.encoder(x)
        first_hop = self.att_1(embedded, mask)
        second_hop = self.att_2(first_hop, mask)

        # Two attention layers are used; their outputs are joined on the
        # feature axis so the head sees both.
        joined = torch.cat((first_hop, second_hop), dim=2)
        return self.q_net(joined)

https://github.com/jiechuanjiang/pytorch_DGN/blob/main/Surviving/DGN%2BATOC/surviving.py

In [12]:
class GridWorldWithCare(object):
    """
    Square grid world with one agent and several task destinations

    The agent moves one cell per step. A task counts as done the first time
    the agent stands on its destination; `step` reports whether every task
    has been done.
    """

    def __init__(self, n_tasks):
        """
        Initialize the gridworld

        Params:
        n_tasks: number of task destinations to place in the grid
        """
        super().__init__()
        self.n_action = 4
        self.n_tasks = n_tasks
        # TODO: maybe include food as part of task, reach dest with > 0 food or something
        self.tasks = [0] * self.n_tasks
        self.agent = [-1, -1]
        self.build_env()

        # Per-task completion flags -- used when calculating rewards.
        self.dones = np.zeros(self.n_tasks)
        self.steps = 0
        # Two agent coordinates plus two relative coordinates per task.
        self.len_obs = (self.n_tasks + 1) * 2

    def reset(self):
        """
        Reset the gridworld: new random layout, cleared flags and step count

        Returns:
        obs: observation list, see get_obs
        adj: adjacency matrix, see get_adj
        """
        self.build_env()
        self.dones = np.zeros(self.n_tasks)
        self.steps = 0
        observation = self.get_obs()
        adjacency = self.get_adj()
        return observation, adjacency

    def build_env(self):
        """
        Draw random positions for every task destination, then for the agent

        Each coordinate is sampled with np.random.randint(0, GRID_DIM).
        Every destination is printed as it is drawn.
        """
        for task_id in range(self.n_tasks):
            dest_x = np.random.randint(0, GRID_DIM)
            dest_y = np.random.randint(0, GRID_DIM)
            self.tasks[task_id] = [dest_x, dest_y]
            print("TASK NUMBER ", task_id, " DEST: ", dest_x, dest_y)
        for axis in (0, 1):
            self.agent[axis] = np.random.randint(0, GRID_DIM)

    def get_obs(self):
        """
        Get observations

        Returns:
        obs: flat list holding the agent position divided by GRID_DIM,
            followed, for each task, by the destination's offset from the
            agent divided by GRID_DIM (x then y)
        """
        # TODO: change this for MTRL
        agent_x = self.agent[0]
        agent_y = self.agent[1]

        features = [agent_x / GRID_DIM, agent_y / GRID_DIM]
        for task_id in range(self.n_tasks):
            destination = self.tasks[task_id]
            features.append((destination[0] - agent_x) / GRID_DIM)
            features.append((destination[1] - agent_y) / GRID_DIM)

        # TODO: 1. if we include maze state or not, and if we do, we would need to figure out
        # how to effectively send that along with task destinations
        # Idea: use distance between agent and task as obs
        return features

    def get_adj(self): # TODO: Change this to use task description encoding.
        # In this case task description is the location of the destination.
        """
        Get adjacency matrix

        Soft attention: entry (i, j) is 1 minus the Euclidean distance
        between destinations i and j divided by GRID_DIM, so closer tasks
        get larger weights. A thresholded 0/1 variant based on
        ADJ_THRESHOLD existed previously and is disabled.

        Returns:
        adj: (n_tasks, n_tasks) numpy array
        """
        count = self.n_tasks
        adj = np.zeros((count, count))

        # Destinations are expressed relative to the agent before the
        # pairwise distances are taken.
        agent_x, agent_y = self.agent[0], self.agent[1]
        offsets = [
            (self.tasks[t][0] - agent_x, self.tasks[t][1] - agent_y)
            for t in range(count)
        ]

        for i in range(count):
            xi, yi = offsets[i]
            for j in range(count):
                xj, yj = offsets[j]
                task_dist = math.sqrt((xj - xi) ** 2 + (yj - yi) ** 2)
                closeness = 1 - float(task_dist) / GRID_DIM
                adj[i, j] = closeness
                adj[j, i] = closeness

        return adj

    def step(self, action):
        """
        Take one step in the gridworld according to the given action

        Params:
        action: 0 moves up (x - 1), 1 moves down (x + 1), 2 moves left
            (y - 1), 3 moves right (y + 1); any other value leaves the agent
            where it is. A move is only applied when is_legal accepts the
            target cell.

        Returns:
        obs: observation after the move
        adj: adjacency matrix after the move
        reward: 1 for each task reached for the first time on this step,
            plus 1 / distance for every task whose destination is not the
            agent's cell (whether or not that task is already done)
        all_tasks_done: True once every task has been done
        """
        self.steps += 1

        moves = ((0, -1, 0), (1, 1, 0), (2, 0, -1), (3, 0, 1))
        here_x, here_y = self.agent[0], self.agent[1]
        for code, delta_x, delta_y in moves:
            if action == code:
                if is_legal(here_x + delta_x, here_y + delta_y):
                    self.agent[0] += delta_x
                    self.agent[1] += delta_y
                break

        total_reward = 0
        now_x, now_y = self.agent[0], self.agent[1]
        for task_id in range(self.n_tasks):
            dest_x, dest_y = self.tasks[task_id][0], self.tasks[task_id][1]
            if dest_x == now_x and dest_y == now_y:
                # Standing on the destination: reward only the first visit.
                if self.dones[task_id] == 0:
                    self.dones[task_id] = 1
                    total_reward += 1
                    print("Task ", task_id, " completed at step ", self.steps)
            else:
                # Shaping term: inverse Euclidean distance to the destination.
                total_reward += 1.0/float((math.sqrt((dest_x-now_x)**2 + (dest_y-now_y)**2)))

        # Only if all the tasks are done, then the episode is done
        all_tasks_done = 0 not in self.dones

        return self.get_obs(), self.get_adj(), total_reward, all_tasks_done

https://github.com/jiechuanjiang/pytorch_DGN/blob/main/Surviving/DGN%2BATOC/main.py

In [None]:
# ---------------------------------------------------------------------------
# Training script: epsilon-greedy rollouts in GridWorldWithCare, followed by
# n_epoch Q-learning updates of MTRL_DGN after every episode.
# ---------------------------------------------------------------------------

# Run on the GPU only when one is available (the unconditional .cuda() calls
# used before crash on CPU-only machines even though USE_CUDA is computed).
device = torch.device("cuda" if USE_CUDA else "cpu")

env = GridWorldWithCare(NUM_TASKS)
observation_space = env.len_obs
n_actions = env.n_action
n_tasks = env.n_tasks

buff = ReplayBufferGCare(buffer_size, observation_space, n_actions, n_tasks)
model = MTRL_DGN(n_tasks, observation_space, hidden_dim, n_actions).to(device)
model_tar = MTRL_DGN(n_tasks, observation_space, hidden_dim, n_actions).to(device)
# NOTE(review): model_tar is not initialised from model's weights; it only
# approaches them through the soft updates below -- confirm this is intended.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# The objects below belong to the ATOC attention-unit variant (an MTRL_ATT
# network trained with BCE to prune "less important" neighbours from the
# adjacency matrix). That variant is disabled, so they are currently unused.
# It may not be needed with a single encoder but could be useful with a
# mixture of encoders.
criterion = nn.BCELoss()
M_Null = torch.Tensor(np.array([np.eye(n_tasks)] * batch_size)).to(device)
M_ZERO = torch.Tensor(np.zeros((batch_size, n_tasks, n_tasks))).to(device)
# threshold = float(sys.argv[1]) TODO: figure this out

log_file_name = "TRIAL-9-SoftAttention-Concat-WithTB"

# One log file and ONE TensorBoard writer for the whole run, both closed on
# exit (previously a new writer was created every episode and never closed).
with open(log_file_name + ".txt", "w+") as f, \
        tb.SummaryWriter(log_dir='./TB-Logs/' + log_file_name) as episode_summary_writer:
    while i_episode < n_episode:
        # Linear epsilon decay after a 40-episode warm-up, floored at 0.01.
        if i_episode > 40:
            epsilon = max(epsilon - 0.001, 0.01)
        i_episode += 1
        steps = 0
        obs, adj = env.reset()
        # np.resize repeats the flat observation to fill n_tasks rows.
        obs = np.resize(obs, (n_tasks, observation_space))
        episode_epoch_count = 0

        # ---- Rollout -----------------------------------------------------
        # NOTE(review): the rollout always runs max_step steps; it does not
        # stop when env.step reports that all tasks are done.
        while steps < max_step:
            steps += 1
            if np.random.rand() < epsilon:
                a = np.random.randint(n_actions)
            else:
                # Greedy action from the first row of the Q-value output.
                # No gradients are needed for action selection.
                with torch.no_grad():
                    q = model(
                        torch.Tensor(np.array([obs])).to(device),
                        torch.Tensor(np.array([adj])).to(device),
                    )[0, 0, :]
                a = q.argmax().item()

            action = a

            next_obs, next_adj, reward, terminated = env.step(action)
            next_obs = np.resize(next_obs, (n_tasks, observation_space))

            buff.add(np.array(obs), action, reward, np.array(next_obs), adj, next_adj, terminated)

            obs = next_obs
            adj = next_adj
            score += reward

        # ---- Periodic score logging --------------------------------------
        if i_episode % 20 == 0:
            print(score)
            f.write(str(score)+'\n')
            episode_summary_writer.add_scalar("Score/Episode", score, i_episode)
            f.flush()
            score = 0

        # ---- Learning ----------------------------------------------------
        for e in range(n_epoch):
            episode_epoch_count += 1
            O, A, R, Next_O, Matrix, Next_Matrix, D = buff.getBatch(batch_size)
            O = torch.Tensor(O).to(device)
            Matrix = torch.Tensor(Matrix).to(device)
            Next_O = torch.Tensor(Next_O).to(device)
            Next_Matrix = torch.Tensor(Next_Matrix).to(device)

            # Single forward pass (it used to be evaluated twice); only the
            # first row of the output is used as the agent's Q-values.
            q_values = model(O, Matrix)[:, 0, :]
            with torch.no_grad():
                target_q_values = model_tar(Next_O, Next_Matrix).max(dim=2)[0][:, 0]
            target_q_values = target_q_values.cpu().numpy()

            # Start from the current predictions and overwrite only the entry
            # of the action actually taken with its TD target. The explicit
            # copy matters on CPU, where .numpy() shares memory with q_values.
            expected_q = q_values.detach().cpu().numpy().copy()
            batch_rows = np.arange(batch_size)
            expected_q[batch_rows, A[:, 0]] = (
                R[:, 0] + (1 - D[:, 0]) * GAMMA * target_q_values
            )

            loss = (q_values - torch.Tensor(expected_q).to(device)).pow(2).mean()
            # Use a run-wide step so loss points from different episodes do
            # not land on the same TensorBoard step.
            loss_step = (i_episode - 1) * n_epoch + episode_epoch_count
            episode_summary_writer.add_scalar('Loss', loss.item(), loss_step)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Soft update of the target network every 10th update:
            # target <- tau * target + (1 - tau) * online.
            if e % 10 == 0:
                with torch.no_grad():
                    for p, p_targ in zip(model.parameters(), model_tar.parameters()):
                        p_targ.data.mul_(tau)
                        p_targ.data.add_((1 - tau) * p.data)

TASK NUMBER  0  DEST:  23 6
TASK NUMBER  1  DEST:  34 33
TASK NUMBER  0  DEST:  31 25
TASK NUMBER  1  DEST:  4 46
TASK NUMBER  0  DEST:  21 28
TASK NUMBER  1  DEST:  1 19
Task  1  completed at step  220
TASK NUMBER  0  DEST:  27 39
TASK NUMBER  1  DEST:  3 39
TASK NUMBER  0  DEST:  8 44
TASK NUMBER  1  DEST:  0 34
TASK NUMBER  0  DEST:  28 42
TASK NUMBER  1  DEST:  15 49
TASK NUMBER  0  DEST:  21 21
TASK NUMBER  1  DEST:  32 46
Task  0  completed at step  44
TASK NUMBER  0  DEST:  17 8
TASK NUMBER  1  DEST:  5 14
TASK NUMBER  0  DEST:  26 38
TASK NUMBER  1  DEST:  14 33
TASK NUMBER  0  DEST:  20 9
TASK NUMBER  1  DEST:  46 3
TASK NUMBER  0  DEST:  11 45
TASK NUMBER  1  DEST:  19 19
TASK NUMBER  0  DEST:  47 45
TASK NUMBER  1  DEST:  23 18
TASK NUMBER  0  DEST:  42 32
TASK NUMBER  1  DEST:  6 36
TASK NUMBER  0  DEST:  1 16
TASK NUMBER  1  DEST:  17 23
TASK NUMBER  0  DEST:  32 15
TASK NUMBER  1  DEST:  32 12
TASK NUMBER  0  DEST:  35 49
TASK NUMBER  1  DEST:  15 2
Task  1  completed at 