ANN trained with the policy-gradient method (parallel training with multiple agents)

In [2]:
# Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as distributions
import numpy as np
import gymnasium as gym  
import random

In [3]:
# Single (non-vectorised) CartPole-v1 environment. Later cells read its
# observation/action space sizes and pass it to forwardPass for rollouts.
env = gym.make("CartPole-v1")

# Define the network
class policyNetwork(nn.Module):
    """Small fully connected policy: observation in, one raw score per action out.

    Args:
        inputDim: Number of features in one observation.
        hiddenDim: Width of the single hidden layer.
        outDim: Number of outputs produced by the final linear layer.
        dropOut: Dropout probability applied to the hidden layer.
    """

    def __init__(self, inputDim, hiddenDim, outDim, dropOut):
        super().__init__()
        # Creation order and attribute names are kept as-is: they determine the
        # parameter initialisation sequence and the state_dict keys.
        self.layer1 = nn.Linear(inputDim, hiddenDim)
        self.layer2 = nn.Linear(hiddenDim, outDim)
        self.dropout = nn.Dropout(dropOut)

    def forward(self, x):
        """Return the un-normalised action scores for the batch ``x``."""
        hidden = F.relu(self.dropout(self.layer1(x)))
        return self.layer2(hidden)

# Calculate the cumulative returns in a specific trajectory
def cumulativeReturn(trajectoryRewards, gamma):
    """Compute normalised discounted returns-to-go for one trajectory.

    For every step t the return is ``G_t = r_t + gamma * G_{t+1}``. The vector
    of returns is then shifted to zero mean and scaled by its sample standard
    deviation.

    Args:
        trajectoryRewards: Sequence of per-step rewards (ints or floats) in
            chronological order.
        gamma: Discount factor.

    Returns:
        1-D float32 tensor holding one normalised return per step. A
        trajectory with fewer than two steps yields zeros (an empty tensor for
        empty input): the sample standard deviation is undefined there and
        would otherwise turn the whole result into NaN.
    """
    discounted = []
    runningReturn = 0.0

    # Walk backwards so each step reuses its successor's return. Appending and
    # reversing once keeps this linear; inserting at index 0 is quadratic.
    for reward in reversed(trajectoryRewards):
        runningReturn = reward + gamma * runningReturn
        discounted.append(runningReturn)
    discounted.reverse()

    # Explicit float dtype: integer rewards would otherwise produce an int64
    # tensor, on which mean()/std() raise.
    returns = torch.tensor(discounted, dtype=torch.float32)

    # std() of a single element is NaN; the centred value is 0 in that case.
    if returns.numel() < 2:
        return torch.zeros_like(returns)

    # Normalize the returns
    return (returns - returns.mean()) / (returns.std() + 1e-9)

def forwardPass(env, policyNetwork, gamma):
    """Roll out one episode with the current policy and collect training data.

    Args:
        env: Environment exposing ``reset(seed=...)`` returning
            ``(state, info)`` and ``step(action)`` returning
            ``(state, reward, terminated, truncated, info)``.
        policyNetwork: Module mapping a batch of observations to one raw score
            (logit) per action.
        gamma: Discount factor forwarded to ``cumulativeReturn``.

    Returns:
        Tuple ``(episodeReward, trajectoryRewards, log_probActions)``:
        the undiscounted sum of rewards, the per-step output of
        ``cumulativeReturn``, and the concatenated log-probabilities of the
        sampled actions (still attached to the autograd graph).
    """
    log_probActions = []
    rewards = []
    episodeReward = 0
    done = False

    # Setup the environment
    policyNetwork.train()
    initialSeed = random.randint(1, 1_000_000_000)  # The random seed that determines the episode's I.C.
    state, _ = env.reset(seed=initialSeed)

    while not done:
        # torch.tensor copies, so the saved network input cannot be altered by
        # an environment that reuses its observation buffer.
        observation = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        actionLogits = policyNetwork(observation)

        # Build the distribution directly from logits instead of softmax ->
        # probs: log_prob is then computed via log-softmax and cannot hit
        # log(0) when one action's probability underflows.
        dist = distributions.Categorical(logits=actionLogits)
        action = dist.sample()

        state, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated

        log_probActions.append(dist.log_prob(action))
        rewards.append(reward)
        episodeReward += reward

    log_probActions = torch.cat(log_probActions)
    trajectoryRewards = cumulativeReturn(rewards, gamma)

    return episodeReward, trajectoryRewards, log_probActions

def computeLoss(log_probActions, trajectoryRewards):
    """Return the negated sum of log-probabilities weighted element-wise by the returns."""
    weightedLogProbs = log_probActions * trajectoryRewards
    return -weightedLogProbs.sum()

def updatePolicyNetwork(log_probActions, trajectoryRewards, optimizer):
    """Run one optimizer step on the loss built from a single trajectory.

    Args:
        log_probActions: Log-probabilities of the sampled actions (carrying gradients).
        trajectoryRewards: Per-step returns; detached so they act as constants.
        optimizer: Optimizer holding the policy parameters.
    """
    # Clear stale gradients first, then backpropagate the fresh loss.
    optimizer.zero_grad()
    policyLoss = computeLoss(log_probActions, trajectoryRewards.detach())
    policyLoss.backward()
    optimizer.step()

In [None]:
# Hyperparameters for the parallel (vectorised) rollout experiment
NUM_ENVS = 7  # Number of environments to create
MAX_EPOCHS = 500 # The total number of episodes taken by ALL agents, cumulatively
DISCOUNT_FACTOR = 0.99
N_TRIALS = 25
REWARD_THRESHOLD = 475
PRINT_INTERVAL = 10
# Sizes are read from the single-env `env` created in an earlier cell.
INPUT_DIM = env.observation_space.shape[0]
HIDDEN_DIM = 128
OUTPUT_DIM = env.action_space.n
DROPOUT = 0.5
MAX_EPISODE_TIME_STEP = 20  # Rows of the per-agent history buffers / step cap per stored episode


envs = gym.make_vec(
    "CartPole-v1",
    num_envs = NUM_ENVS,
    vectorization_mode = "async",
    wrappers = (gym.wrappers.TimeAwareObservation,),
)
# The environments are reset once, right before the rollout loop, so no
# additional (unused) reset is issued here.

# NOTE(review): assigning into `metadata` after construction presumably does
# not change how the already-built vector env steps. If autoreset should
# really be disabled, it likely has to be requested when the env is created
# (see the Gymnasium vector-env docs for the installed version) - TODO confirm.
envs.metadata["autoreset_mode"] = "DISABLED"


# Policy network and its optimizer (training mode keeps dropout active)
policy = policyNetwork(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT)
optimizer = optim.Adam(policy.parameters(), lr = 0.01)
policy.train()

# Torch buffers holding per-time-step data: one row per step, one column per
# agent. NaN marks slots that have not been written for the current episode.
logProbActions_Hist = torch.full((MAX_EPISODE_TIME_STEP, NUM_ENVS), float("nan"), dtype = torch.float32)
rewards_hist = torch.full_like(logProbActions_Hist, float("nan"))
# Row index of the most recent write per agent; -1 means "nothing stored yet".
agentTimeStep = torch.full((NUM_ENVS,), -1, dtype = torch.int32)
episodeReward = []
# Column index of every agent, used together with agentTimeStep for scatter writes.
agentIndexMask = torch.arange(NUM_ENVS)

# Reset all environments
observations, _ = envs.reset()

# Start taking steps
MAX_OVERALL_TIME_STEPS = 30  # Number of vectorised steps taken before this loop stops


def _resetAgentBuffers(agentIdx):
    """Forget the stored trajectory of one agent so its next step is written to row 0."""
    agentTimeStep[agentIdx] = -1
    logProbActions_Hist[:, agentIdx] = torch.nan
    rewards_hist[:, agentIdx] = torch.nan


overallTimeStep = 0
while overallTimeStep < MAX_OVERALL_TIME_STEPS:
    # Only the first INPUT_DIM observation features are fed to the policy;
    # presumably the remaining column is the time feature appended by
    # TimeAwareObservation - TODO confirm.
    policyInput = torch.tensor(observations[:, :INPUT_DIM], dtype = torch.float32)
    # Logits go straight into the distribution (no explicit softmax) so that
    # log_prob stays finite even when a probability underflows.
    dist = distributions.Categorical(logits = policy(policyInput))
    action = dist.sample()

    # Make a step and increase agents' timeStep
    observations, rewards, terminations, truncations, infos = envs.step(action.numpy())
    agentTimeStep += 1

    # Scatter this step's data into row agentTimeStep[i] of column i.
    logProbActions_Hist[agentTimeStep, agentIndexMask] = dist.log_prob(action)
    rewards_hist[agentTimeStep, agentIndexMask] = torch.as_tensor(rewards, dtype = torch.float32)

    # Episodes that the environment itself ended (terminated or truncated)
    # NOTE(review): how the finished sub-environments are reset depends on the
    # vector env's autoreset behaviour, which this loop does not control
    # explicitly - verify that the first stored step of the next episode is a
    # real transition.
    finishedByEnv = np.flatnonzero(terminations | truncations)
    if finishedByEnv.size > 0:
        print(f"Termination at {finishedByEnv}")

        for idx in finishedByEnv:

            ########################
            # Update the network here
            ########################

            _resetAgentBuffers(idx)

    # Episodes cut off because the history buffer is full (MAX_EPISODE_TIME_STEP rows)
    # NOTE(review): only the buffers are cleared here; the underlying
    # environment is not reset and keeps running - confirm this is intended.
    reachedStepCap = torch.nonzero(agentTimeStep >= MAX_EPISODE_TIME_STEP - 1).flatten().tolist()
    for idx in reachedStepCap:

        ########################
        # Update the network here
        ########################

        _resetAgentBuffers(idx)

    overallTimeStep += 1


Termination at [1]
Termination at [5]
Termination at [2 6]
Termination at [0]
Termination at [3]


In [431]:
# Dump the rollout bookkeeping: per-agent step index, stored log-probs, stored rewards.
print(agentTimeStep, logProbActions_Hist, rewards_hist, sep = "\n")

tensor([ 6, 16, 12,  0,  9, 13, 12], dtype=torch.int32)
tensor([[-0.2920, -0.3923, -0.6880, -0.9979, -0.5084, -0.3833, -0.4005],
        [-1.0750, -1.0011, -0.9115,     nan, -0.9179, -0.8226, -0.4081],
        [-0.5846, -0.7765, -0.7151,     nan, -1.0343, -1.0120, -0.4890],
        [-0.4250, -0.5163, -0.8783,     nan, -0.9152, -0.5070, -0.7524],
        [-0.5369, -0.9509, -0.5596,     nan, -0.3865, -0.4636, -0.9579],
        [-0.4437, -0.5351, -0.9533,     nan, -0.7257, -0.4763, -0.5011],
        [-0.9083, -1.0588, -0.4375,     nan, -0.7246, -0.5281, -0.4407],
        [    nan, -0.7561, -0.4859,     nan, -0.7193, -0.6004, -0.7777],
        [    nan, -0.5023, -0.6837,     nan, -0.7922, -0.4180, -0.4974],
        [    nan, -0.4245, -0.5170,     nan, -0.5875, -1.0286, -0.4441],
        [    nan, -0.3972, -0.5325,     nan,     nan, -0.4093, -1.0277],
        [    nan, -0.7666, -1.0094,     nan,     nan, -0.5765, -0.8492],
        [    nan, -0.9232, -0.6858,     nan,     nan, -0.3679, -0.79

In [None]:
# Hyperparameters (single-environment training run)
MAX_EPOCHS = 500            # Upper bound on the number of training episodes
DISCOUNT_FACTOR = 0.99
N_TRIALS = 25               # Window size of the running mean used for the stop criterion
REWARD_THRESHOLD = 475      # Stop once the running mean return reaches this value
PRINT_INTERVAL = 10         # Print progress every this many episodes
INPUT_DIM = env.observation_space.shape[0]
HIDDEN_DIM = 128
OUTPUT_DIM = env.action_space.n
DROPOUT = 0.5

episodeReturns = []
policy = policyNetwork(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT)
optimizer = optim.Adam(policy.parameters(), lr = 0.01)

# One policy update per episode. The loop ends early only when the running
# mean return reaches REWARD_THRESHOLD. The former unconditional `break` at the
# end of the body stopped training after the first episode and was removed.
for episode in range(MAX_EPOCHS):
    episodeReward, trajectoryRewards, log_probActions = forwardPass(env, policy, DISCOUNT_FACTOR)
    updatePolicyNetwork(log_probActions, trajectoryRewards, optimizer)

    episodeReturns.append(episodeReward)
    mean_episode_return = np.mean(episodeReturns[-N_TRIALS:])

    if episode % PRINT_INTERVAL == 0:
        print(f'| Episode: {episode:3} | Mean Rewards: {mean_episode_return:5.1f} |')

    if mean_episode_return >= REWARD_THRESHOLD:
        print(f'Reached reward threshold in {episode} episodes')
        break
    


AttributeError: module 'sympy.printing' has no attribute 'str'