ANN using gradient policy for training (Training with single agent)

In [5]:
# Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as distributions
import numpy as np
import gymnasium as gym  
import random
import time

In [3]:
# Making the gymnasium environment
env = gym.make("CartPole-v1")

# Define the network
class policyNetwork(nn.Module):
    def __init__(self, inputDim, hiddenDim, outDim, dropOut):
        super().__init__()
        self.layer1 = nn.Linear(inputDim, hiddenDim)
        self.layer2 = nn.Linear(hiddenDim, outDim)
        self.dropout = nn.Dropout(dropOut)
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.dropout(x)
        x = F.relu(x)
        x = self.layer2(x)
        return x

# Calculate the cumulative returns in a specific trajectory
def cumulativeReturn(trajectoryRewards, gamma):
    # gamma is the discount factor
    
    returns = []
    __R = 0
    
    for reward in reversed(trajectoryRewards):
        __R = reward + gamma * __R
        returns.insert(0, __R)
    
    returns = torch.tensor(returns)
    
    # Normalize the returns
    return (returns - returns.mean())/returns.std()

def forwardPass(env, policyNetwork, gamma):
    log_probActions = []
    rewards = []
    done = False
    episodeReward = 0
    
    # Setup the environment
    policyNetwork.train()
    initialSeed = random.randint(1,1_000_000_000) # The random seed that determines the episode's I.C.
    state, _ = env.reset(seed = initialSeed)
    
    while not done:
        actionPred = policyNetwork(torch.Tensor(state).unsqueeze(0))
        actionProb = F.softmax(actionPred, dim = -1)
        dist = distributions.Categorical(actionProb)
        action = dist.sample()
        
        state, reward, terminated, truncated, info = env.step(action.item())
        done = terminated or truncated
        
        log_probActions.append(dist.log_prob(action))
        rewards.append(reward)
        episodeReward += reward
        
    log_probActions = torch.cat(log_probActions)
    trajectoryRewards = cumulativeReturn(rewards, gamma)
    
    return episodeReward, trajectoryRewards, log_probActions

def computeLoss(log_probActions, trajectoryRewards):
    return -(log_probActions * trajectoryRewards).sum()

def updatePolicyNetwork(log_probActions, trajectoryRewards, optimizer):
    __loss = computeLoss(log_probActions, trajectoryRewards.detach())
    
    optimizer.zero_grad()
    __loss.backward()
    optimizer.step()

In [6]:
# Hyperparameters
MAX_EPOCHS = 5000
DISCOUNT_FACTOR = 0.99
N_TRIALS = 25
REWARD_THRESHOLD = 200
PRINT_INTERVAL = 10
INPUT_DIM = env.observation_space.shape[0]
HIDDEN_DIM = 128
OUTPUT_DIM = env.action_space.n
DROPOUT = 0.5
overallTimeStep = 0

episodeReturns = []
policy = policyNetwork(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT)
optimizer = optim.Adam(policy.parameters(), lr = 0.01)

_startTime = time.time()
for episode in range(MAX_EPOCHS):
    episodeReward, trajectoryRewards, log_probActions = forwardPass(env, policy, DISCOUNT_FACTOR)
    updatePolicyNetwork(log_probActions, trajectoryRewards, optimizer)
    
    episodeReturns.append(episodeReward)
    mean_episode_return = np.mean(episodeReturns[-N_TRIALS:])

    if episode % PRINT_INTERVAL == 0:
        print(f'| Episode: {episode:3} | Mean Rewards: {mean_episode_return:5.1f} | tps: {overallTimeStep/(time.time()-_startTime)}')

    if mean_episode_return >= REWARD_THRESHOLD:
        print(f'Reached reward threshold in {episode} episodes')
        break
    overallTimeStep += len(trajectoryRewards)

| Episode:   0 | Mean Rewards:  38.0 | tps: 0.0
| Episode:  10 | Mean Rewards:  18.6 | tps: 1032.2217368566771
| Episode:  20 | Mean Rewards:  34.0 | tps: 1142.3422615887928
| Episode:  30 | Mean Rewards:  47.8 | tps: 1169.434118242
| Episode:  40 | Mean Rewards:  51.5 | tps: 1163.206235116598
| Episode:  50 | Mean Rewards:  49.6 | tps: 1166.435724696814
| Episode:  60 | Mean Rewards:  50.9 | tps: 1167.669059707102
| Episode:  70 | Mean Rewards:  62.6 | tps: 1173.7855120975498
| Episode:  80 | Mean Rewards: 115.8 | tps: 1129.816121697737
| Episode:  90 | Mean Rewards: 143.9 | tps: 1231.0990144816208


KeyboardInterrupt: 