https://github.com/jiechuanjiang/pytorch_DGN/blob/main/Surviving/DGN%2BATOC/config.py


In [1]:
hidden_dim = 64
max_step = 500
GAMMA = 0.99
n_episode = 800
i_episode = 0
capacity = 6500 #change back to 65000
batch_size = 32 #change back to 64
n_epoch = 25
epsilon = 0.9
score = 0
comm_flag = 1
threshold = -0.1
tau = 0.98
cost_all = 0
cost_comm = 0

https://github.com/jiechuanjiang/pytorch_DGN/blob/main/Surviving/DGN%2BATOC/buffer.py

In [2]:
import numpy as np
class ReplayBuffer(object):

	def __init__(self, buffer_size, obs_space, n_action, n_ant):
		self.buffer_size = buffer_size
		self.n_ant = n_ant
		self.pointer = 0
		self.len = 0
		self.actions = np.zeros((self.buffer_size,self.n_ant),dtype = np.int32)
		self.rewards = np.zeros((self.buffer_size,n_ant))
		self.dones = np.zeros((self.buffer_size,1))
		self.obs = np.zeros((self.buffer_size,self.n_ant,obs_space))
		self.next_obs = np.zeros((self.buffer_size,self.n_ant,obs_space))
		self.matrix = np.zeros((self.buffer_size,self.n_ant,self.n_ant))
		self.next_matrix = np.zeros((self.buffer_size,self.n_ant,self.n_ant))

	def getBatch(self, batch_size):

		index = np.random.choice(self.len, batch_size, replace=False)
		return self.obs[index], self.actions[index], self.rewards[index], self.next_obs[index], self.matrix[index], self.next_matrix[index], self.dones[index]

	def add(self, obs, action, reward, next_obs, matrix, next_matrix, done):

		self.obs[self.pointer] = obs
		self.actions[self.pointer] = action
		self.rewards[self.pointer] = reward
		self.next_obs[self.pointer] = next_obs
		self.matrix[self.pointer] = matrix
		self.next_matrix[self.pointer] = next_matrix
		self.dones[self.pointer] = done
		self.pointer = (self.pointer + 1)%self.buffer_size
		self.len = min(self.len + 1, self.buffer_size)

https://github.com/jiechuanjiang/pytorch_DGN/blob/main/Surviving/DGN%2BATOC/model.py

In [3]:
import math, random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd 
import torch.nn.functional as F
USE_CUDA = torch.cuda.is_available()
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)

class ATT(nn.Module):
	def __init__(self, din):
		super(ATT, self).__init__()
		self.fc1 = nn.Linear(din, 64)
		self.fc2 = nn.Linear(64, 64)
		self.fc3 = nn.Linear(64, 1)

	def forward(self, x):
		y = F.relu(self.fc1(x))
		y = F.relu(self.fc2(y))
		y = F.sigmoid(self.fc3(y))
		return y
		
class Encoder(nn.Module): # TODO: Need to make it a CNN for higher dim obs space like MetaWorld
	def __init__(self, din=32, hidden_dim=128):
		super(Encoder, self).__init__()
		self.fc = nn.Linear(din, hidden_dim)

	def forward(self, x):
		embedding = F.relu(self.fc(x))
		return embedding

class AttModel(nn.Module):
	def __init__(self, n_node, din, hidden_dim, dout):
		super(AttModel, self).__init__()
		self.fcv = nn.Linear(din, hidden_dim)
		self.fck = nn.Linear(din, hidden_dim)
		self.fcq = nn.Linear(din, hidden_dim)
		self.fcout = nn.Linear(hidden_dim, dout)

	def forward(self, x, mask):
		v = F.relu(self.fcv(x))
		q = F.relu(self.fcq(x))
		k = F.relu(self.fck(x)).permute(0,2,1)
		att = F.softmax(torch.mul(torch.bmm(q,k), mask) - 9e15*(1 - mask),dim=2)
        # Note: Order of applying adj matrix is different than that in paper. Don't get confused!
		out = torch.bmm(att,v)
		#out = torch.add(out,v)
		#out = F.relu(self.fcout(out))
		return out

class Q_Net(nn.Module):
	def __init__(self, hidden_dim, dout):
		super(Q_Net, self).__init__()
		self.fc = nn.Linear(hidden_dim, dout)

	def forward(self, x):
		q = self.fc(x)
		return q

class DGN(nn.Module):
	def __init__(self,n_agent,num_inputs,hidden_dim,num_actions):
		super(DGN, self).__init__()
		
		self.encoder = Encoder(num_inputs,hidden_dim)
        # TODO: Try both single encoder and mix of encoder settings
        # Will remain same for MTRL
		self.att_1 = AttModel(n_agent,hidden_dim,hidden_dim,hidden_dim)
		self.att_2 = AttModel(n_agent,hidden_dim,hidden_dim,hidden_dim)
		self.q_net = Q_Net(hidden_dim,num_actions)
        # Q Net remains same for MTRL
		
	def forward(self, x, mask):
		h1 = self.encoder(x)
		h2 = self.att_1(h1, mask)
		h3 = self.att_2(h2, mask)
        # TODO: try concatentation for MTRL
		q = self.q_net(h3)
        # Note: No concatenation done. Output of last attention head used directly
        # Note: 2 attention heads used
		return q 

https://github.com/jiechuanjiang/pytorch_DGN/blob/main/Surviving/DGN%2BATOC/surviving.py

In [4]:
import numpy as np
import copy

def is_legal(x,y):

	return (x>=1)&(x<=30)&(y>=1)&(y<=30)

class Surviving(object):
	def __init__(self, n_agent):
		super(Surviving, self).__init__()
		self.n_agent = n_agent
		self.n_action = 5
		self.max_food = 10
		self.capability = 2*self.n_agent

		self.maze = self.build_env()
		self.ants = []
		for i in range(self.n_agent):
			self.ants.append([np.random.randint(0,30)+1,np.random.randint(0,30)+1])

		self.foods = []
		for i in range(self.n_agent):
			self.foods.append(self.max_food)

		self.n_resource = 8
		self.resource = []
		self.resource_pos = []
		for i in range(self.n_resource):
			self.resource_pos.append([np.random.randint(0,30)+1,np.random.randint(0,30)+1])
			self.resource.append(np.random.randint(100,120))
		
		self.steps = 0
		self.len_obs = 29

	def reset(self):

		self.maze = self.build_env()

		self.ants = []
		for i in range(self.n_agent):
			self.ants.append([np.random.randint(0,30)+1,np.random.randint(0,30)+1])

		self.foods = []
		for i in range(self.n_agent):
			self.foods.append(self.max_food)

		self.resource = []
		self.resource_pos = []
		for i in range(self.n_resource):
			self.resource_pos.append([np.random.randint(0,30)+1,np.random.randint(0,30)+1])
			self.resource.append(np.random.randint(100,120))

		return self.get_obs(), self.get_adj()

	def build_env(self):

		maze = np.zeros((32,32))
		for i in range(32):
			maze[0][i] = -1
			maze[i][0] = -1
			maze[31][i] = -1
			maze[i][31] = -1

		return maze

	def get_obs(self):
        
       # TODO: change this for MTRL 

		obs = []

		maze_ant = np.zeros((32,32))
		for index in range(self.n_agent):
			x = self.ants[index][0]
			y = self.ants[index][1]
			maze_ant[x][y] = 1

		for index in range(self.n_agent):
			h = []
			x = self.ants[index][0]
			y = self.ants[index][1]
			for i in range(5): # TODO - Understand
				h.append(np.mod(x,2))
				x = int(x/2)
			for i in range(5): # TODO - Understand
				h.append(np.mod(y,2))
				y = int(y/2)
			x_t = self.ants[index][0]
			y_t = self.ants[index][1]
			for i in range(-1,2):
				for j in range(-1,2):
					h.append(self.maze[x_t+i][y_t+j])

			for i in range(-1,2):
				for j in range(-1,2):
					h.append(maze_ant[x_t+i][y_t+j])

			h.append(self.foods[index])
			obs.append(h)

		return obs

	def get_adj(self): # TODO: Change this to use task description encoding

		adj = np.zeros((self.n_agent,self.n_agent))

		for index in range(self.n_agent):
			x = self.ants[index][0]
			y = self.ants[index][1]
			for i in range(index):
				x1 = self.ants[i][0]
				y1 = self.ants[i][1]
				if (np.abs(x-x1)<=3)|(np.abs(y-y1)<=3):
					adj[index][i] = 1
					adj[i][index] = 1

		return adj 


	def step(self,actions):

		for i in range(self.n_agent):
			x = self.ants[i][0]
			y = self.ants[i][1]
			
			if actions[i] == 0:
				if self.maze[x-1][y]!= -1:
					 self.ants[i][0] = x-1
			if actions[i] == 1:
				if self.maze[x+1][y]!= -1:
					 self.ants[i][0] = x+1
			if actions[i] == 2:
				if self.maze[x][y-1]!= -1:
					 self.ants[i][1] = y-1
			if actions[i] == 3:
				if self.maze[x][y+1]!= -1:
					 self.ants[i][1] = y+1
			if actions[i] == 4:
				self.foods[i] += 2*self.maze[x][y]
				self.maze[x][y] = 0

			self.foods[i] = max(0,min(self.foods[i]-1,self.max_food))

		reward = [0.4]*self.n_agent
		for i in range(self.n_agent):
			if self.foods[i] == 0:
				reward[i] = - 0.2

		done = False

		if (self.maze.sum()+120) > self.capability: # TODO: Understand

			return self.get_obs(), self.get_adj(), reward, done

		for i in range(self.n_resource):

			x = self.resource_pos[i][0] + np.random.randint(-3,4)
			y = self.resource_pos[i][1] + np.random.randint(-3,4)

			if is_legal(x,y):

				num = np.random.randint(1,6)
				self.maze[x][y] += num
				self.maze[x][y] = min(self.maze[x][y],5)
				self.resource[i] -= num

				if self.resource[i] <= 0:
					self.resource_pos[i][0] = np.random.randint(0,30)+1
					self.resource_pos[i][1] = np.random.randint(0,30)+1
					self.resource[i] = np.random.randint(100,120)

		return self.get_obs(), self.get_adj(), reward, done

In [None]:
import numpy as np
import copy

MAZE_DIM = 16
NUM_TASKS = 3

def is_legal(x,y):

	return (x>=1)&(x<=30)&(y>=1)&(y<=30)

class SurvivingGCare(object):
	def __init__(self, n_agent):
        """
        """
		super(SurvivingGCare, self).__init__()
		self.n_action = 4
        # TODO: maybe include food as part of task, reach dest with > 0 food or something
		self.capability = 2*self.n_agent
        self.tasks = [0]*NUM_TASKS
        self.agent = [-1, -1]
		self.build_env()
		self.steps = 0
# 		self.len_obs = 29

	def reset(self):
        """
        """

		self.build_env()
		return self.get_obs(), self.get_adj()

	def build_env(self):
        """
        """
		self.maze = np.zeros((MAZE_DIM,MAZE_DIM))
        for i in range(NUM_TASKS):
            x = np.random.randint(0, MAZE_DIM)
            y = np.random.randint(0, MAZE_DIM)
            self.tasks[i] = [x, y]
            self.maze[x][y] = 1
        self.agent[0] = np.random.randint(0, MAZE_DIM)
        self.agent[1] = np.random.randint(0, MAZE_DIM)
        self.maze[self.agent[0]][self.agent[1]] = -1

	def get_obs(self):
        """
        """
       # TODO: change this for MTRL 
		obs = []
        
        x_agent = self.agent[0]
        y_agent = self.agent[1]
        
        obs.append(x_agent)
        obs.append(y_agent)
        
# 		for i in range(-1,2):
# 			for j in range(-1,2):
# 				obs.append(self.maze[x_agent+i][y_agent+j])
        
        for i in range(NUM_TASKS):
            obs.append(self.tasks[i][0])
            obs.append(self.tasks[i][1])
            
        # TODO: 1. if we include maze state or not, and if we do, we would need to figure out
        # how to effectively send that along with task destinations
            
		return obs

	def get_adj(self): # TODO: Change this to use task description encoding. 
        # In this case task description is the location of the destination.

		adj = np.zeros((self.n_agent,self.n_agent))

		for index in range(self.n_agent):
			x = self.ants[index][0]
			y = self.ants[index][1]
			for i in range(index):
				x1 = self.ants[i][0]
				y1 = self.ants[i][1]
				if (np.abs(x-x1)<=3)|(np.abs(y-y1)<=3):
					adj[index][i] = 1
					adj[i][index] = 1

		return adj 


	def step(self,actions):

		for i in range(self.n_agent):
			x = self.ants[i][0]
			y = self.ants[i][1]
			
			if actions[i] == 0:
				if self.maze[x-1][y]!= -1:
					 self.ants[i][0] = x-1
			if actions[i] == 1:
				if self.maze[x+1][y]!= -1:
					 self.ants[i][0] = x+1
			if actions[i] == 2:
				if self.maze[x][y-1]!= -1:
					 self.ants[i][1] = y-1
			if actions[i] == 3:
				if self.maze[x][y+1]!= -1:
					 self.ants[i][1] = y+1
			if actions[i] == 4:
				self.foods[i] += 2*self.maze[x][y]
				self.maze[x][y] = 0

			self.foods[i] = max(0,min(self.foods[i]-1,self.max_food))

		reward = [0.4]*self.n_agent
		for i in range(self.n_agent):
			if self.foods[i] == 0:
				reward[i] = - 0.2

		done = False

		if (self.maze.sum()+120) > self.capability: # TODO: Understand

			return self.get_obs(), self.get_adj(), reward, done

		for i in range(self.n_resource):

			x = self.resource_pos[i][0] + np.random.randint(-3,4)
			y = self.resource_pos[i][1] + np.random.randint(-3,4)

			if is_legal(x,y):

				num = np.random.randint(1,6)
				self.maze[x][y] += num
				self.maze[x][y] = min(self.maze[x][y],5)
				self.resource[i] -= num

				if self.resource[i] <= 0:
					self.resource_pos[i][0] = np.random.randint(0,30)+1
					self.resource_pos[i][1] = np.random.randint(0,30)+1
					self.resource[i] = np.random.randint(100,120)

		return self.get_obs(), self.get_adj(), reward, done

https://github.com/jiechuanjiang/pytorch_DGN/blob/main/Surviving/DGN%2BATOC/main.py

In [None]:
import math, random, copy
import numpy as np
import os,sys

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd 
import torch.nn.functional as F

# from model import DGN, ATT
# from buffer import ReplayBuffer
# from surviving import Surviving
# from config import *

USE_CUDA = torch.cuda.is_available()

env = Surviving(n_agent = 100)
n_ant = env.n_agent
observation_space = env.len_obs
n_actions = env.n_action

buff = ReplayBuffer(capacity,observation_space,n_actions,n_ant)
model = DGN(n_ant,observation_space,hidden_dim,n_actions)
model_tar = DGN(n_ant,observation_space,hidden_dim,n_actions)
model = model.cuda()
model_tar = model_tar.cuda()
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
att = ATT(observation_space).cuda()
att_tar = ATT(observation_space).cuda()
att_tar.load_state_dict(att.state_dict())
optimizer_att = optim.Adam(att.parameters(), lr = 0.0001)
criterion = nn.BCELoss()

M_Null = torch.Tensor(np.array([np.eye(n_ant)]*batch_size)).cuda()
M_ZERO = torch.Tensor(np.zeros((batch_size,n_ant,n_ant))).cuda()
# threshold = float(sys.argv[1]) TODO: figure this out
# f = open(sys.argv[1]+'-'+sys.argv[2]+'.txt','w+')
f = open("TRIAL.txt", "w+")
while i_episode<n_episode:

	if i_episode > 40:
		epsilon -= 0.001
		if epsilon < 0.01:
			epsilon = 0.01
	i_episode+=1
	steps = 0
	obs, adj = env.reset()
	while steps < max_step:
		steps+=1 
		action=[]
		cost_all += adj.sum()
		v_a = np.array(att(torch.Tensor(np.array([obs])).cuda())[0].cpu().data)
		for i in range(n_ant):
			if np.random.rand() < epsilon:
				adj[i] = adj[i]*0 if np.random.rand() < 0.5 else adj[i]*1
			else:
				adj[i] = adj[i]*0 if v_a[i][0] < threshold else adj[i]*1
        # Note: above loop is epsilon greedy exploration to give less importance to observations that fall below a certain threshold
        # May not be needed if we use single encoder but could be useful in the case of mixture of encoders
        # Pruning "less imp" neighbours whose obs fall below a certain threshold
		n_adj = adj*comm_flag
		cost_comm += n_adj.sum()
		n_adj = n_adj + np.eye(n_ant)
		q_dummy = model(torch.Tensor(np.array([obs])).cuda(), torch.Tensor(np.array([n_adj])).cuda())
# 		print("model output shape", q_dummy.shape)
		q = model(torch.Tensor(np.array([obs])).cuda(), torch.Tensor(np.array([n_adj])).cuda())[0]
		for i in range(n_ant):
			if np.random.rand() < epsilon:
				a = np.random.randint(n_actions)
			else:
				a = q[i].argmax().item()
			action.append(a)

		next_obs, next_adj, reward, terminated = env.step(action)

		buff.add(np.array(obs),action,reward,np.array(next_obs),n_adj,next_adj,terminated)
		obs = next_obs
		adj = next_adj
		score += sum(reward)

	if i_episode%20==0:
		print(score/2000)
#         print(score/2000)
		f.write(str(score/2000)+'	'+str(cost_comm/cost_all)+'\n')
        # Cost (neighbors in adj matrix)after pruning/ Cost before pruning
		f.flush()
		score = 0
		cost_comm = 0
		cost_all = 0

	if i_episode < 40:
		continue

	for e in range(n_epoch):
		
		O,A,R,Next_O,Matrix,Next_Matrix,D = buff.getBatch(batch_size)
		O = torch.Tensor(O).cuda()
		Matrix = torch.Tensor(Matrix).cuda()
		Next_O = torch.Tensor(Next_O).cuda()
		Next_Matrix = torch.Tensor(Next_Matrix).cuda()

		label = model(Next_O, Next_Matrix+M_Null).max(dim = 2)[0] - model(Next_O, M_Null).max(dim = 2)[0]
		print("Label", label.shape)
		label = (label - label.mean())/(label.std()+0.000001) + 0.5
		label = torch.clamp(label, 0, 1).unsqueeze(-1).detach()
		print("Label after clamping", label.shape)
		label_dummy = att(Next_O)
		print("ATT output", label_dummy.shape)
		loss = criterion(att(Next_O), label)
		optimizer_att.zero_grad()
		loss.backward()
		optimizer_att.step()
        # Basically att is learning which obs from the maze help return the max q value

		V_A_D = att_tar(Next_O).expand(-1,-1,n_ant)
		Next_Matrix = torch.where(V_A_D > threshold, Next_Matrix, M_ZERO)
		Next_Matrix = Next_Matrix*comm_flag + M_Null

		q_values = model(O, Matrix)
		target_q_values = model_tar(Next_O, Next_Matrix).max(dim = 2)[0]
		target_q_values = np.array(target_q_values.cpu().data)
		expected_q = np.array(q_values.cpu().data)
		
		for j in range(batch_size):
			for i in range(n_ant):
				expected_q[j][i][A[j][i]] = R[j][i] + (1-D[j])*GAMMA*target_q_values[j][i]
		
		loss = (q_values - torch.Tensor(expected_q).cuda()).pow(2).mean()
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

		with torch.no_grad():
			for p, p_targ in zip(model.parameters(), model_tar.parameters()):
				p_targ.data.mul_(tau)
				p_targ.data.add_((1 - tau) * p.data)
			for p, p_targ in zip(att.parameters(), att_tar.parameters()):
				p_targ.data.mul_(tau)
				p_targ.data.add_((1 - tau) * p.data)

-87.89099999999782
-89.48079999999698
Label torch.Size([32, 100])
Label after clamping torch.Size([32, 100, 1])
ATT output torch.Size([32, 100, 1])
Label torch.Size([32, 100])
Label after clamping torch.Size([32, 100, 1])
ATT output torch.Size([32, 100, 1])
Label torch.Size([32, 100])
Label after clamping torch.Size([32, 100, 1])
ATT output torch.Size([32, 100, 1])
Label torch.Size([32, 100])
Label after clamping torch.Size([32, 100, 1])
ATT output torch.Size([32, 100, 1])
Label torch.Size([32, 100])
Label after clamping torch.Size([32, 100, 1])
ATT output torch.Size([32, 100, 1])
Label torch.Size([32, 100])
Label after clamping torch.Size([32, 100, 1])
ATT output torch.Size([32, 100, 1])
Label torch.Size([32, 100])
Label after clamping torch.Size([32, 100, 1])
ATT output torch.Size([32, 100, 1])
Label torch.Size([32, 100])
Label after clamping torch.Size([32, 100, 1])
ATT output torch.Size([32, 100, 1])
Label torch.Size([32, 100])
Label after clamping torch.Size([32, 100, 1])
ATT outp

In [None]:
class EatingInMaze():
    # Class for eating some foods in a maze as efficient as possible
    # The agents will be out to random places
    def __init__(self, num_of_agents, num_of_foods, grid_size, communication_range):
        # grid_size: tuple of width and height of the maze
        # communication_range: number of cells that a certain agent can communicate 
        self.num_of_agents = num_of_agents
        self.num_of_foods = num_of_foods
        self.grid_size = grid_size
        # observations will be an array with the maze values
        self.obs_range = 1 # number of cells around each agent that an agent can observe
        self.comm_range = communication_range # number of cells around each agent that an agent can communicate with another agent - helps with cooperation

        self.max_life = 10 # Each agent will have 10 lives in the beginning

        self.reset()
        
    def get_maze_index(self, rand_index):
        return (int(rand_index/self.grid_size[1]), int(rand_index%self.grid_size[1]))

    def get_agent_index(self, x, y):
        return int(x * self.grid_size[1] + y)

    def is_inside_maze(self, x, y):
        return (x >= 0) & (x < self.grid_size[0]) & (y >= 0) & (y < self.grid_size[1])

    def build_maze(self): # The method to build the maze 
        # TODO
        # Put the agents in a random position in the maze 
        # And add some random foods
        self.maze = np.zeros((self.grid_size[0], self.grid_size[1]))
        # print(self.maze)
        # Randomly put the foods and agents here
        self.food_indices = np.ones(self.num_of_foods) * -1
        self.agent_indices = np.ones(self.num_of_agents) * -1
        for i in range(self.num_of_foods + self.num_of_agents):
            rand_index = np.random.randint(0, self.grid_size[0] * self.grid_size[1] - 1)
            while (rand_index in self.food_indices) or (rand_index in self.agent_indices): # Make sure that food and agent will not be put in the same cell
                rand_index = np.random.randint(0, self.grid_size[0] * self.grid_size[1] - 1)

            if i < self.num_of_foods:
                self.food_indices[i] = rand_index
                # Put the food to the maze - if there is a food in the cell then maze should have 1 in that particular cell
                self.maze[self.get_maze_index(rand_index)[0]][self.get_maze_index(rand_index)[1]] = 1
            else:
                self.agent_indices[i - self.num_of_foods] = rand_index
                # Put the agent to the maze - if there is an agent in the cell then the maze should have 2
                self.maze[self.get_maze_index(rand_index)[0]][self.get_maze_index(rand_index)[1]] = 2 # TODO: this might be problematic

        # print('self.food_indices: {}, self.agent_indices: {}'.format(self.food_indices, self.agent_indices))
        print('maze:\n{}'.format(self.maze))

    def reset(self): # Resets the environment
        # TODO

        self.lives = np.ones(self.num_of_agents) * self.max_food

        self.build_maze()
        obs = self.get_obs()
        adj = self.get_adj()

        return obs, adj

    def get_obs(self):
        # Traverse the whole maze and for each agent get the maze values from the surroundings
        obs = []
        for agent_index in self.agent_indices:
            x, y = self.get_maze_index(agent_index) # Get the position of the agent in the grid
            print('agent_index: {}, x: {}, y: {}, get_agent_index(x,y): {}'.format(agent_index, x, y, self.get_agent_index(x,y)))
            # Get the values from cells that are self.obs_range away from the agent
            curr_obs = []
            for i in range(-1,2): # i will be -1,0,1
                for j in range(-1,2):
                    if i != 0 or j != 0: # We don't want to get the agent exactly
                        if self.is_inside_maze(x+i, y+j):
                            curr_obs.append(self.maze[x+i][y+j])
                        else:
                            curr_obs.append(-1) # If this is not good then it will be -1

            # print('curr_obs: {}'.format(curr_obs))
            obs.append(curr_obs)
            
        print('obs:\n{}'.format(np.array(obs)))
        return obs

    def get_adj(self):
        # TODO

        adj = np.zeros((self.num_of_agents, self.num_of_agents))

        # Check if there are any agents that are closer to her than communication range and insert 1 in the adjacency matrix
        # if that is the case
        for i in range(self.num_of_agents):
            curr_x, curr_y = self.get_maze_index(self.agent_indices[i])
            for j in range(self.num_of_agents):
                other_x, other_y = self.get_maze_index(self.agent_indices[j])
                # Calculate the distance between current and other agents
                dist = math.sqrt((curr_x - other_x)**2 + (curr_y - other_y)**2)
                if dist < self.comm_range:
                    adj[i][j] = 1

        print('adj:\n{}'.format(adj))
        return adj

    def step(self, actions):
        # TODO
        # actions: [agent#1 action, agent#2 action, ...]

        for i in range(self.num_of_agents):
            x, y = self.get_maze_index(self.agent_indices[i])

            if actions[i] == 0: # Move up (decrease x by one)
                if self.is_inside_maze(x-1,y): # We can go up
                    # Change agent_indices and maze
                    self.agent_indices[i] = self.get_agent_index(x-1, y)
                    self.maze[x][y] = 0
                    self.maze[x-1][y] = 2 # Moved the agent
            
            if actions[i] == 1: # Move down (increase x by one)
                if self.is_inside_maze(x+1,y): # We can go up
                    # Change agent_indices and maze
                    self.agent_indices[i] = self.get_agent_index(x+1, y)
                    self.maze[x][y] = 0
                    self.maze[x+1][y] = 2 # Moved the agent
            
            if actions[i] == 2: # Move left (decrease Y by one)
                if self.is_inside_maze(x,y-1): # We can go left
                    # Change agent_indices and maze
                    self.agent_indices[i] = self.get_agent_index(x, y-1)
                    self.maze[x][y] = 0
                    self.maze[x][y-1] = 2 # Moved the agent
            
            if actions[i] == 3: # Move left (decrease Y by one)
                if self.is_inside_maze(x,y+1): # We can go left
                    # Change agent_indices and maze
                    self.agent_indices[i] = self.get_agent_index(x, y+1)
                    self.maze[x][y] = 0
                    self.maze[x][y+1] = 2 # Moved the agent

            if actions[i] == 4: # Eat the food in this cell
                self.maze[x][y] = 0

            # Decrease life of each agent by one - in each step life of an agent is decreased
            self.lives[i] = max(0,min(self.lives[i]-1,self.max_life))

        # Get rewards
        reward = [0.4]*self.num_of_agents
        for i in range(self.num_of_agents):
            if self.lives[i] == 0: # self.foods also stand for health
                reward[i] = - 0.2

        # If there are no more foods in the whole maze, then the task is done
        done = not (1 in self.maze)
        print('done: {}'.format(done)) 

        return self.get_obs(), self.get_adj(), reward, done


env = EatingInMaze(num_of_agents=6, num_of_foods=5, grid_size=(6,5), communication_range=3)