In [None]:
from utils.CoopEnv import CoopEnv
from utils.IACagent import Agent

In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# Constant hyperparameters shared by the cells below.
n = 20  # passed to CoopEnv as `n`; presumably the number of agents (loops use env.n) - confirm in utils.CoopEnv
num_of_tasks  = 10  # passed to CoopEnv; also the row count of `converged_structure`
num_of_sims = 3  # number of independent training simulations run by main()
step_limit = 12000  # environment steps per simulation

In [None]:
# Discrete communication levels: an evenly spaced grid covering
# [lower_bound, upper_bound] inclusive, in increments of comm_step.
lower_bound = 0
upper_bound = 2.00
comm_step = 0.01

# np.linspace with an explicit point count always includes the upper bound and
# avoids the length ambiguity np.arange has with a floating-point step.
num_comm_levels = int(round((upper_bound - lower_bound) / comm_step)) + 1
comm_array = np.linspace(lower_bound, upper_bound, num_comm_levels)

# Number of grid points; passed to Agent as the size of its communication output.
comm_space = len(comm_array)

In [None]:
# Single environment instance shared by training, saving, plotting and the
# execution cell; main() calls env.reset() at the start of every simulation.
env = CoopEnv(n=n, num_of_tasks=num_of_tasks)

In [None]:
# Recording buffers filled in place by main().
# steps_to_complete[s, t+1] stores sum(rewards)/env.n for step t of simulation s;
# column 0 is never written and therefore stays 0.
steps_to_complete = np.zeros((num_of_sims, step_limit+1))
# The per-agent buffers below have no simulation axis: every simulation writes
# rows 0..step_limit-1 again, so only the last simulation's data survives.
comm_rec_actions = np.zeros((step_limit, env.n))  # communication value chosen by each agent
move_rec_actions = np.zeros((step_limit, env.n))  # move action chosen by each agent
rec_rewards = np.zeros((step_limit, env.n))  # reward received by each agent

In [None]:
def main():
    """Train one independent agent per environment slot on the cooperative game.

    Runs `num_of_sims` simulations of `step_limit` steps each. Every simulation
    builds a fresh set of agents and resets the shared `env`. A step has two
    phases: each agent picks a move action (`env.step1`), then picks an index
    into `comm_array` from the returned context (`env.step2` receives the
    mapped communication values). Afterwards every agent is trained on its own
    transition.

    Side effects:
        * Fills the module-level buffers `steps_to_complete`,
          `comm_rec_actions`, `move_rec_actions` and `rec_rewards` in place.
        * Publishes the agents of the most recent simulation as the
          module-level `agentlist`, because later cells roll the trained
          agents out and would otherwise hit a NameError.

    Returns:
        dict | None: agent index -> Agent from the last simulation, or None
        when `num_of_sims` is 0.
    """
    global agentlist

    agents = None

    for s in range(num_of_sims):

        print("----------------------------------")
        print(f"Starting training simulation {s}:")
        print("----------------------------------")

        # Fresh agents and a fresh environment for every simulation.
        # alr / vlr / clr are presumably the actor, value and communication
        # learning rates - confirm in utils.IACagent.
        agents = {i: Agent(comm_space, alr=3e-4, vlr=6e-4, clr=1e-4) for i in range(env.n)}
        agentlist = agents
        observations = env.reset(n=n, num_of_tasks=num_of_tasks)

        for t in range(step_limit):
            print(t, end='\r')

            # Phase 1: every agent chooses a move from its own observation.
            move_action = np.array([agents[i].choose_action_move(observations[i]) for i in range(env.n)])
            next_observations, context = env.step1(move_action)

            # Phase 2: every agent chooses an index into comm_array; the
            # environment receives the corresponding communication values.
            comm_index = np.array([agents[i].choose_action_comm(context[i]) for i in range(env.n)])
            comm_action = comm_array[comm_index]
            rewards = env.step2(comm_action)

            comm_rec_actions[t] = comm_action
            move_rec_actions[t] = move_action

            # Train each agent on its own transition.
            # NOTE(review): the communication *value* (not its index) is what
            # gets passed to train(); confirm that Agent.train expects this.
            for i in range(env.n):
                action = np.array([move_action[i], comm_action[i]])
                singleton_val = env.singleton_vals[f'Player {i+1}']
                agents[i].train(observations[i], action, rewards[i], next_observations[i], context[i], singleton_val)

            # Advance state.
            observations = next_observations

            # Record the mean per-agent reward (offset by one column) and the
            # individual rewards of this step.
            steps_to_complete[s, t+1] = sum(rewards)/env.n
            rec_rewards[t, :] = rewards

    return agents


In [None]:
# Run the full training: num_of_sims simulations of step_limit steps each.
if __name__ == "__main__":

    # Profiling alternative; needs `import cProfile`, which the cells shown do not import.
    #cProfile.run('main()')
    main()


In [None]:
# Persist the training results.
# env.CS is encoded as a 0/1 membership matrix with one row per entry of env.CS
# and one column per agent; member labels are converted with int(label) - 1,
# i.e. they are treated as 1-based.
converged_structure = np.zeros((num_of_tasks, env.n))
for task_row, members in enumerate(env.CS):
    member_cols = [int(label) - 1 for label in members]
    converged_structure[task_row, member_cols] = 1

# np.save appends ".npy" to file names that do not already end with it.
np.save(f'n{n}t{num_of_tasks}_cum_rew_bc', steps_to_complete)
np.save(f'n{n}t{num_of_tasks}_actions_bc', comm_rec_actions)
np.save(f'n{n}t{num_of_tasks}_cstruct_cnf0.npy', converged_structure)

In [None]:
plt.style.use('ggplot')
# Alternative: average the reward curve over all simulations.
#b = np.mean(steps_to_complete, axis=0)
# flatten() is row-major and each row holds step_limit+1 entries (more than 5000
# with the settings above), so this keeps the first 5000 columns of simulation 0 only.
b = steps_to_complete.flatten()[0:5000]
# Per-step standard deviation across simulations; not used by the plots in the cells shown.
s_dev = np.std(steps_to_complete, axis=0)

def moving_average(arr, window_size):
    """Return the sliding-window mean of `arr` as a list.

    Entry k is the mean of arr[k : k + window_size]. Only full windows are
    used, so the result has len(arr) - window_size + 1 entries (an empty list
    when the window is longer than the input).

    Windows are taken along the first axis and summed with the built-in
    `sum`, so a 2-D NumPy array yields one averaged row per window.
    """
    last_start = len(arr) - window_size
    return [
        sum(arr[start:start + window_size]) / window_size
        for start in range(last_start + 1)
    ]

# Smoothed system-reward curve on a 10.5 x 10.5 inch figure.
fig = plt.gcf()
fig.set_size_inches(10.5, 10.5)
plt.title(f'IAC on cooperative game; n={n}, tasks={num_of_tasks}')
plt.ylabel('Cumulative Average System Reward')
plt.xlabel('Number of Steps Played')
smoothed_reward = moving_average(b, 20)  # 20-step sliding mean of the selected reward series
plt.plot(smoothed_reward)

In [None]:
# Smoothed communication value of every agent (one line per column of
# comm_rec_actions) on a 10.5 x 10.5 inch figure.
fig = plt.gcf()
fig.set_size_inches(10.5, 10.5)
plt.title(f'Communication action for cooperative game, action_space=[{lower_bound},{upper_bound}]')
plt.ylabel('Comm Bias')
plt.xlabel('Number of Steps Played')

smoothed_comm = moving_average(comm_rec_actions, 21)  # 21-step sliding mean along the step axis
plt.plot(smoothed_comm)

In [None]:
# Print env.CS (the structure stored as `converged_structure` when saving),
# then plot every agent's smoothed reward. rec_rewards has no simulation axis,
# so the curves come from the last training simulation.
print(env.CS)

for agent_idx in range(env.n):
    plt.plot(moving_average(rec_rewards[:, agent_idx], 20))

# Label the figure so it can be read on its own, like the other plots.
plt.xlabel('Number of Steps Played')
plt.ylabel('Agent Reward (20-step moving average)')
plt.title(f'Per-agent reward for cooperative game; n={n}, tasks={num_of_tasks}')

fig = plt.gcf()
fig.set_size_inches(10.5, 10.5)

In [None]:
# Execution Cell
#
# Roll the trained agents out without any further training and print the mean
# per-agent reward of every step. Requires `agentlist` (the dict of agents
# built during training) to exist at notebook scope.

num_eval_steps = 1000  # length of the evaluation rollout

observations = env.reset(n=n, num_of_tasks=num_of_tasks)

# Evaluation loop: same two-phase step as training, but train() is never called.
for _ in range(num_eval_steps):
    # Phase 1: collect the move action of each agent.
    move_action = np.array([agentlist[i].choose_action_move(observations[i]) for i in range(env.n)])

    next_observations, context = env.step1(move_action)

    # Phase 2: each agent picks an index into comm_array; the environment
    # receives the corresponding communication values.
    comm_index = np.array([agentlist[i].choose_action_comm(context[i]) for i in range(env.n)])
    comm_action = comm_array[comm_index]

    rewards = env.step2(comm_action)
    print(sum(rewards)/env.n)

    # Advance state.
    observations = next_observations