In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as distributions

import matplotlib.pyplot as plt
import numpy as np
import gym

In [None]:
# The tabular MDP environment shared by several agents
class MultiAgentEnvironment:
    """Tabular multi-agent MDP with one transition kernel and one reward vector per agent.

    Attributes:
        n_agents: number of agents.
        n_states: number of discrete states; states are labelled 0..n_states-1.
        n_actions: number of discrete actions.
        transition_kernels: sequence indexed by agent id; each entry is indexed as
            [state][action] and yields a probability vector over next states.
        rewards: sequence indexed by agent id; each entry is indexed by next state.
    """

    def __init__(self, n_agents, n_states, n_actions, transition_kernels, rewards):
        self.n_agents = n_agents
        self.n_states = n_states
        self.n_actions = n_actions
        self.transition_kernels = transition_kernels
        self.rewards = rewards
        self.states = np.arange(n_states)

    def step(self, agent_id, current_state, action):
        """Sample one transition for `agent_id`.

        Returns:
            (next_state, reward) where the reward is the agent's reward for the
            sampled next state.
        """
        transition_kernel = self.transition_kernels[agent_id]
        reward = self.rewards[agent_id]
        transition_probs = transition_kernel[current_state][action]
        next_state = np.random.choice(self.states, p=transition_probs)
        reward_s = reward[next_state]
        return next_state, reward_s

    @staticmethod
    def kernel_sa_generation(num_agents, num_states=16, num_actions=4):
        """Draw one random transition kernel per agent.

        Args:
            num_agents: how many kernels to generate.
            num_states: number of states (default 16, the previously hard-coded value).
            num_actions: number of actions (default 4, the previously hard-coded value).

        Returns:
            List of arrays of shape (num_states, num_actions, num_states) whose last
            axis sums to 1.
        """
        agent_transition_kernels = []
        for _ in range(num_agents):
            transition_prob_kernel = np.random.rand(num_states, num_actions, num_states)
            # Normalise over the next-state axis so each (state, action) row is a distribution.
            transition_prob_kernel /= np.sum(transition_prob_kernel, axis=2, keepdims=True)
            agent_transition_kernels.append(transition_prob_kernel)
        return agent_transition_kernels

    @staticmethod
    def reward_generation(num_agents, num_states=16):
        """Draw one random reward vector (uniform in [0, 1)) of length `num_states` per agent."""
        return [np.random.rand(num_states) for _ in range(num_agents)]

# Use the static generators to draw transition kernels and rewards for 5 agents
K1 = MultiAgentEnvironment.kernel_sa_generation(5)
R1 = MultiAgentEnvironment.reward_generation(5)

# Create the manual environment: 5 agents, 16 states, 4 actions
manual_envs = MultiAgentEnvironment(n_agents=5, n_states=16, n_actions=4, transition_kernels=K1, rewards=R1)

# Smoke test: one transition for a single agent. The message reports the agent
# that was actually stepped (it previously said "agent 0" while stepping agent 3).
demo_agent_id = 3
next_state, reward = manual_envs.step(agent_id=demo_agent_id, current_state=0, action=2)
print(f"Next state and reward for agent {demo_agent_id}:", next_state, reward)

In [None]:
!pip install -q swig
!pip3 install box2d-py
!pip3 install gym[Box_2D]

In [None]:
# Two independent LunarLander instances: one for training rollouts, one for evaluation.
train_env, test_env = gym.make('LunarLander-v2'), gym.make('LunarLander-v2')

In [None]:
SEED = 1234

# Seed the environments with SEED and SEED+1 respectively, then NumPy and PyTorch.
for seed_offset, seeded_env in enumerate((train_env, test_env)):
    seeded_env.seed(SEED + seed_offset)
np.random.seed(SEED)
torch.manual_seed(SEED);

In [None]:
class MLP(nn.Module):
    """Three-layer perceptron in double precision.

    Layout: two blocks of Linear -> Dropout -> PReLU followed by a final Linear
    projection to `output_dim`. All parameters are cast to float64.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout = 0.1):
        super().__init__()

        # Build the two hidden blocks in a loop; the layer order (and therefore the
        # `net.<index>` parameter names) is the same as writing them out by hand.
        layers = []
        in_features = input_dim
        for _ in range(2):
            layers += [nn.Linear(in_features, hidden_dim), nn.Dropout(dropout), nn.PReLU()]
            in_features = hidden_dim
        layers.append(nn.Linear(hidden_dim, output_dim))

        self.net = nn.Sequential(*layers)
        self.double()

    def forward(self, x):
        """Apply the stacked layers to `x` and return the raw output."""
        return self.net(x)

In [None]:
class ActorCritic(nn.Module):
    """Pairs an actor module and a critic module behind one forward call."""

    def __init__(self, actor, critic):
        super().__init__()
        self.actor, self.critic = actor, critic

    def forward(self, state):
        """Return `(actor(state), critic(state))`, evaluating the actor first."""
        return self.actor(state), self.critic(state)

In [None]:
# Layer sizes: input width from the observation space, output width from the action space.
INPUT_DIM = train_env.observation_space.shape[0]
HIDDEN_DIM = 128
OUTPUT_DIM = test_env.action_space.n

# The actor emits one logit per action; the critic emits a single value estimate.
actor = MLP(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM)
critic = MLP(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM, output_dim=1)

policynet = ActorCritic(actor=actor, critic=critic)

In [None]:
def init_weights(m):
    """Xavier-normal initialise Linear layers; intended for `module.apply(init_weights)`.

    Modules that are not `nn.Linear` are left untouched. The bias is zeroed only
    when the layer has one, so `nn.Linear(..., bias=False)` no longer crashes.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)

In [None]:
# Run init_weights over every submodule of policynet (only nn.Linear layers are changed).
policynet.apply(init_weights)

In [None]:
# Step size passed to the Adam optimizers that MultiAgent builds for each agent.
LEARNING_RATE = 0.0005

# The single shared optimizer is disabled, so this cell does not define `optimizer`.
#optimizer = optim.Adam(policynet.parameters(), lr = LEARNING_RATE)

In [None]:
def train(env, policy, optimizer, discount_factor):
    """Run one full episode and apply a joint actor/critic update.

    Actions are sampled from the softmax of the actor output. After the episode,
    returns and advantages are computed and handed to `update_policy`.

    Returns:
        (policy_loss, value_loss, episode_reward)
    """
    policy.train()

    step_log_probs, step_values, step_rewards = [], [], []
    episode_reward = 0

    observation = env.reset()
    finished = False

    while not finished:
        # Observation -> (1, obs_dim) float64 tensor on the module-level `device`.
        obs_tensor = torch.FloatTensor(observation).unsqueeze(0).double().to(device)

        logits, value_estimate = policy(obs_tensor)

        sampler = distributions.Categorical(F.softmax(logits, dim = -1))
        chosen_action = sampler.sample()
        chosen_log_prob = sampler.log_prob(chosen_action)

        observation, reward, finished, _ = env.step(chosen_action.item())

        step_log_probs.append(chosen_log_prob)
        step_values.append(value_estimate)
        step_rewards.append(reward)
        episode_reward += reward

    log_prob_actions = torch.cat(step_log_probs)
    values = torch.cat(step_values).squeeze(-1)

    values = values.to(device)
    log_prob_actions = log_prob_actions.to(device)

    returns = calculate_returns(step_rewards, discount_factor)
    advantages = calculate_advantages(returns, values)

    policy_loss, value_loss = update_policy(advantages, log_prob_actions, returns, values, optimizer)

    return policy_loss, value_loss, episode_reward

In [None]:
def single_actor_train(env, policy, optimizer, discount_factor):
    """Roll out one episode and take a single optimizer step on the policy loss.

    The policy loss is `-(advantage * log_prob).sum()` with the advantages detached,
    so only the log-probability term carries gradient.

    Returns:
        (policy_loss, episode_reward)
    """
    policy.train()

    log_probs, value_preds, reward_log = [], [], []
    episode_reward = 0

    obs = env.reset()
    done = False

    while not done:
        # Observation -> (1, obs_dim) float64 tensor on the module-level `device`.
        obs = torch.FloatTensor(obs).unsqueeze(0).double().to(device)

        action_pred, value_pred = policy(obs)

        dist = distributions.Categorical(F.softmax(action_pred, dim = -1))
        action = dist.sample()
        log_probs.append(dist.log_prob(action))
        value_preds.append(value_pred)

        obs, reward, done, _ = env.step(action.item())

        reward_log.append(reward)
        episode_reward += reward

    log_prob_actions = torch.cat(log_probs).to(device)
    values = torch.cat(value_preds).squeeze(-1).to(device)

    returns = calculate_returns(reward_log, discount_factor)
    advantages = calculate_advantages(returns, values).detach()

    # Policy-gradient loss.
    policy_loss = (- (advantages * log_prob_actions).sum()).to(device)

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

    return policy_loss.item(), episode_reward

def single_critic_train(env, policy, optimizer, discount_factor):
    """Train the critic online with one-step TD targets for at most 50 environment steps.

    At every step the critic's prediction for the current state is regressed
    (smooth L1) towards `reward + discount_factor * V(next_state)`, and the
    optimizer is stepped immediately.

    Fix over the previous version: when the step ends the episode, the target is
    just `reward`; it no longer bootstraps from the value of a state after the end.

    Args:
        env: environment with the 4-tuple `step` API (state, reward, done, info).
        policy: module returning `(action_logits, value)` for a state tensor.
        optimizer: optimizer stepped once per environment step.
        discount_factor: discount applied to the bootstrapped next-state value.

    Returns:
        (last_value_loss, episode_reward)
    """
    policy.train()

    done = False
    episode_reward = 0

    state = env.reset()
    state = torch.FloatTensor(state).unsqueeze(0)
    state = state.to(torch.double).to(device)

    k = 0
    while (not done) and (k < 50):

        action_pred, value_pred = policy(state)

        action_prob = F.softmax(action_pred, dim = -1)
        dist = distributions.Categorical(action_prob)
        action = dist.sample()

        state, reward, done, _ = env.step(action.item())

        state = torch.FloatTensor(state).unsqueeze(0)
        state = state.to(torch.double).to(device)

        # The target is a constant for the regression, so no graph is built for it.
        with torch.no_grad():
            _, nvalue_pred = policy(state)

        # Drop the bootstrap term when the episode has ended.
        # NOTE(review): `done` may also be set by a time-limit wrapper; confirm whether
        # truncated episodes should still bootstrap.
        returns = reward + discount_factor * (1.0 - float(done)) * nvalue_pred
        returns = returns.detach()

        value_loss = F.smooth_l1_loss(value_pred, returns)

        optimizer.zero_grad()
        value_loss.backward()
        optimizer.step()  # optimize the critic network

        episode_reward += reward
        k = k + 1

    return value_loss.item(), episode_reward


In [None]:
def single_critic_train_v2(env, policy, optimizer, discount_factor):
    """Roll out one episode, then fit the critic to the discounted returns in one step.

    Actions are sampled from the softmax of the actor output. The regression targets
    come from `calculate_returns` with its default arguments, i.e. the returns are
    normalized before being used as targets.

    Args:
        env: environment with the 4-tuple `step` API (state, reward, done, info).
        policy: module returning `(action_logits, value)` for a state tensor.
        optimizer: optimizer stepped once at the end of the episode.
        discount_factor: discount used when accumulating returns.

    Returns:
        (value_loss, episode_reward)
    """
    policy.train()

    values = []
    rewards = []
    done = False
    episode_reward = 0

    state = env.reset()

    while not done:

        state = torch.FloatTensor(state).unsqueeze(0)
        state = state.to(torch.double).to(device)

        action_pred, value_pred = policy(state)

        action_prob = F.softmax(action_pred, dim = -1)
        action = distributions.Categorical(action_prob).sample()

        state, reward, done, _ = env.step(action.item())

        values.append(value_pred)
        rewards.append(reward)

        episode_reward += reward

    values = torch.cat(values).squeeze(-1).to(device)

    returns = calculate_returns(rewards, discount_factor).detach()
    # calculate_returns lets torch.tensor infer the dtype, which may differ from the
    # critic's float64 output; align it explicitly so the loss sees matching dtypes.
    returns = returns.to(values.dtype)

    # smooth_l1_loss takes (input, target): the predictions go first.
    value_loss = F.smooth_l1_loss(values, returns)

    optimizer.zero_grad()
    value_loss.backward()
    optimizer.step()  # optimize the critic network

    return value_loss.item(), episode_reward

In [None]:
def calculate_returns(rewards, discount_factor, normalize = True):
    """Compute the discounted return for every step of an episode.

    Args:
        rewards: per-step rewards in chronological order.
        discount_factor: multiplier applied to the return of the following step.
        normalize: when True, standardize the returns (zero mean, unit std).

    Returns:
        1-D tensor on the module-level `device`, one return per step.

    With fewer than two steps, or when all returns are equal, the standard deviation
    is undefined or zero. The returns are then only mean-centred instead of being
    divided by it, which previously produced NaN/inf values.
    """
    returns = []
    R = 0

    # Accumulate backwards, then reverse once (avoids repeated front insertion).
    for r in reversed(rewards):
        R = r + R * discount_factor
        returns.append(R)
    returns.reverse()

    returns = torch.tensor(returns)
    returns = returns.to(device)

    if normalize:
        centered = returns - returns.mean()
        if returns.numel() > 1 and returns.std() > 0:
            returns = centered / returns.std()
        else:
            returns = centered

    return returns

In [None]:
def calculate_advantages(returns, values, normalize = True):
    """Return `returns - values`, optionally standardized.

    Args:
        returns: tensor of returns.
        values: tensor of value predictions, broadcast-compatible with `returns`.
        normalize: when True, standardize the advantages (zero mean, unit std).

    With fewer than two elements, or when all advantages are equal, the standard
    deviation is undefined or zero. The advantages are then only mean-centred
    instead of being divided by it, which previously produced NaN/inf values.
    """
    advantages = returns - values

    if normalize:
        centered = advantages - advantages.mean()
        if advantages.numel() > 1 and advantages.std() > 0:
            advantages = centered / advantages.std()
        else:
            advantages = centered

    return advantages

In [None]:
def update_policy(advantages, log_prob_actions, returns, values, optimizer):
    """Take one optimizer step on the policy-gradient loss plus the critic loss.

    Args:
        advantages: per-step advantages; detached here, so they act as constants.
        log_prob_actions: per-step log-probabilities of the actions taken.
        returns: per-step regression targets for the critic; detached here.
        values: per-step critic predictions.
        optimizer: optimizer over the parameters to update.

    Returns:
        (policy_loss, value_loss) as Python floats.
    """
    advantages = advantages.detach()
    returns = returns.detach()

    policy_loss = - (advantages * log_prob_actions).sum()

    # smooth_l1_loss takes (input, target) and already reduces to a scalar mean.
    value_loss = F.smooth_l1_loss(values, returns)

    optimizer.zero_grad()

    # One backward pass over the summed losses accumulates the same gradients as two
    # separate passes, and also works if the two losses share part of a graph.
    (policy_loss + value_loss).backward()

    optimizer.step()

    return policy_loss.item(), value_loss.item()

In [None]:
def evaluate(env, policy):
    """Run one greedy episode (argmax action, no gradients) and return its total reward.

    The previous version called `torch.set_default_tensor_type(torch.cuda.FloatTensor)`,
    which changes a process-wide default for all later tensor creation and fails on
    machines without CUDA. Tensors are placed explicitly via `.to(device)` instead.

    Args:
        env: environment with the 4-tuple `step` API (state, reward, done, info).
        policy: module returning `(action_logits, value)` for a state tensor.

    Returns:
        Sum of the rewards collected over the episode.
    """
    policy.eval()

    done = False
    episode_reward = 0

    state = env.reset()

    while not done:

        state = torch.FloatTensor(state).unsqueeze(0)
        state = state.to(torch.double).to(device)

        with torch.no_grad():
            action_pred, _ = policy(state)
            action_prob = F.softmax(action_pred, dim = -1)

        # Greedy action: the most probable one under the current policy.
        action = torch.argmax(action_prob, dim = -1)

        state, reward, done, _ = env.step(action.item())

        episode_reward += reward

    return episode_reward

In [None]:
import torch
import copy
# Always define `device`; it was previously left undefined on machines without CUDA,
# which made the `.to(device)` below (and every training function) raise NameError.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available!")
else:
    print("GPU is not available.")
policynet = policynet.to(device)

def average_weights(agents):
    """Average the named parameters of `agents` element-wise.

    Returns:
        Dict mapping each parameter name to a new tensor holding the mean of that
        parameter over all agents. The agents' own tensors are not modified.
    """
    num_agents = len(agents)
    totals = {}

    for agent in agents:
        for name, param in agent.named_parameters():
            if name in totals:
                totals[name] += param.data
            else:
                # Clone on first sight so the running sum never aliases an agent's weights.
                totals[name] = param.data.clone()

    for accumulated in totals.values():
        accumulated /= num_agents

    return totals




# Multi-agent container: initialization, per-agent environments, and weight averaging
class MultiAgent:
    """A set of independently trained copies of one policy network.

    Each agent is a deep copy of `policy_net` moved to `device` and wrapped in
    `nn.DataParallel`, with its own Adam optimizer and its own LunarLander instance.

    Args:
        policy_net: network to copy for every agent.
        num_agents: number of agents and environments to create.
        heterogeneity: per-agent increment of `wind_power`; 0 gives identical
            settings, larger values spread the settings further apart.
        base_wind_power: `wind_power` passed for agent 0.
    """

    def __init__(self, policy_net, num_agents, heterogeneity = 1, base_wind_power = 15.0):
        self.agents = [copy.deepcopy(policy_net) for _ in range(num_agents)]
        self.agents = [model.to(device) for model in self.agents]
        self.agents = [nn.DataParallel(model) for model in self.agents]
        self.num_agents = num_agents
        self.optimizers = [optim.Adam(agent.parameters(), lr = LEARNING_RATE) for agent in self.agents]

        # NOTE(review): in gym's LunarLander `wind_power` appears to take effect only
        # when `enable_wind=True` is also passed. If so, these environments are
        # identical despite the different values. Confirm and enable wind if
        # heterogeneous dynamics are intended.
        self.envs = [
            gym.make('LunarLander-v2', wind_power=base_wind_power + heterogeneity*i)
            for i in range(num_agents)
        ]

    def reset_weights(self):
        """Overwrite every agent's parameters with the average over all agents.

        The averaged values are copied into each agent's existing parameter tensors.
        Assigning the same averaged tensor to every agent (`param.data = avg`) made
        all agents share one storage, so an in-place optimizer step on one agent
        silently changed all the others.
        """
        avg_weights = average_weights(self.agents)

        with torch.no_grad():
            for agent in self.agents:
                for name, param in agent.named_parameters():
                    param.copy_(avg_weights[name])

        # Verify that every agent now holds exactly the averaged weights.
        for i, agent in enumerate(self.agents):
            for name, param in agent.named_parameters():
                assert torch.equal(param.data, avg_weights[name]), f"Agent {i+1} is not initialized correctly"
            #print("Initialization successful!")

GPU is available!


In [None]:

#print(multiagents.optimizers[1])
# Re-run init_weights over every submodule of policynet; the Linear layers get freshly drawn weights.
policynet.apply(init_weights)

ActorCritic(
  (actor): MLP(
    (net): Sequential(
      (0): Linear(in_features=8, out_features=128, bias=True)
      (1): Dropout(p=0.1, inplace=False)
      (2): PReLU(num_parameters=1)
      (3): Linear(in_features=128, out_features=128, bias=True)
      (4): Dropout(p=0.1, inplace=False)
      (5): PReLU(num_parameters=1)
      (6): Linear(in_features=128, out_features=4, bias=True)
    )
  )
  (critic): MLP(
    (net): Sequential(
      (0): Linear(in_features=8, out_features=128, bias=True)
      (1): Dropout(p=0.1, inplace=False)
      (2): PReLU(num_parameters=1)
      (3): Linear(in_features=128, out_features=128, bias=True)
      (4): Dropout(p=0.1, inplace=False)
      (5): PReLU(num_parameters=1)
      (6): Linear(in_features=128, out_features=1, bias=True)
    )
  )
)

In [None]:
def trainall(M):
    """Train 5 agents with alternating critic and actor phases and weight averaging.

    Per episode:
      1. every agent runs `M` critic updates (`single_critic_train_v2`) in its own env;
      2. all agents are reset to the average weights;
      3. every agent runs one actor update (`single_actor_train`);
      4. all agents are reset to the average weights again.

    Args:
        M: number of critic updates per agent per episode.

    Returns:
        List with one entry per episode: the reward of each agent's first critic
        rollout of that episode, averaged over the agents.
    """
    multiagents = MultiAgent(policy_net = policynet, num_agents = 5)
    multiagents.reset_weights()

    MAX_EPISODES = 400
    DISCOUNT_FACTOR = 0.99
    N_TRIALS = 25
    PRINT_EVERY = 25

    train_rewards = []

    for episode in range(1, MAX_EPISODES+1):

        # Critic phase.
        critic_rewards = 0
        for agent_id, model in enumerate(multiagents.agents):
            for i in range(M):
                value_loss, train_reward_critic = single_critic_train_v2(multiagents.envs[agent_id], model, multiagents.optimizers[agent_id], DISCOUNT_FACTOR)

                # Only the first rollout is recorded: it is the one generated with
                # the freshly averaged weights.
                if i == 0:
                    critic_rewards = critic_rewards + train_reward_critic

        # Divide by the agent count explicitly rather than by a leaked loop index.
        train_rewards.append(critic_rewards / multiagents.num_agents)

        multiagents.reset_weights()

        # Actor phase.
        for agent_id, model in enumerate(multiagents.agents):
            policy_loss, train_reward_actor = single_actor_train(multiagents.envs[agent_id], model, multiagents.optimizers[agent_id], DISCOUNT_FACTOR)

        multiagents.reset_weights()

        if episode % PRINT_EVERY == 0:
            mean_train_rewards = np.mean(train_rewards[-N_TRIALS:])
            print(f'| Episode: {episode:3} | Mean Train Rewards: {mean_train_rewards:7.1f} |')

    return train_rewards

In [None]:
plt.figure(figsize=(12, 8))

# One training run, and one curve, per number of critic updates M.
m = [2, 5, 10, 20]
totalrewards = []
for i in m:
    train_rewards = trainall(i)
    plt.plot(train_rewards, label=f'M = {i}')
    totalrewards.append(train_rewards)

# Decorate the figure before showing it: anything added after plt.show() would land
# on a new, empty figure instead of on the curves above.
plt.xlabel('Episode', fontsize=20)
plt.ylabel('Reward', fontsize=20)
plt.hlines(100, 0, len(train_rewards), color='r')  # reward threshold
plt.legend(loc='lower right')
plt.grid()
plt.show()

In [None]:
MAX_EPISODES = 800
DISCOUNT_FACTOR = 0.99
N_TRIALS = 25
REWARD_THRESHOLD = 100
PRINT_EVERY = 25

M = 1  # number of critic updates per episode

# This loop needs an optimizer over policynet, and no earlier cell defines one,
# so create it here to keep the cell runnable on a fresh kernel.
optimizer = optim.Adam(policynet.parameters(), lr = LEARNING_RATE)

train_rewards = []
test_rewards = []

train_rewards_actor = []
train_rewards_critic = []


for episode in range(1, MAX_EPISODES+1):

    # Critic phase: M episode-long critic updates.
    for _ in range(M):
        value_loss, train_reward_critic = single_critic_train_v2(train_env, policynet, optimizer, DISCOUNT_FACTOR)

    # Actor phase: one policy-gradient update.
    policy_loss, train_reward_actor = single_actor_train(train_env, policynet, optimizer, DISCOUNT_FACTOR)

    train_rewards.append(train_reward_actor)

    if episode % PRINT_EVERY == 0:
        mean_train_rewards = np.mean(train_rewards[-N_TRIALS:])

        print(f'| Episode: {episode:3} | Mean Train Rewards: {mean_train_rewards:7.1f} |')
