In [195]:
## Importing the correct packages

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mujoco
import gymnasium as gym

import random
import torch
import torch.nn as nn
from torch.optim import Adam
import tqdm
import math
import os
import copy


# for copying deep nets to another variable
from copy import deepcopy

# library for ou noise as implemented with the paper
from ou_noise import ou

# to view model summary
from torchsummary import summary

# queue for replay buffer
from collections import deque

In [329]:
class ReplayBuffer:
    def __init__(self, buffer_size):
        # initialize parameters
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=self.buffer_size)

    def insert(self, obs, action, reward, next_obs, done):
        # tuple to represent transition
        trans = (torch.tensor(obs, dtype=torch.float32), torch.tensor(action, dtype=torch.float32),
                 torch.tensor(reward, dtype=torch.float32), torch.tensor(next_obs, dtype=torch.float32), torch.tensor(done, dtype=torch.float32))

        # save transition to buffer
        # use deque because once its full it discards old items
        self.buffer.append(trans)

    def sample_random_minibatch(self, batch_size, device):
        # Random idx to sample from buffer w/o replacement
        batch = random.sample(self.buffer, batch_size)

        # Unpack batch into separate lists of tensors
        obs, actions, rewards, next_obs, dones = zip(*batch)
        
        # Convert lists of tensors into single tensors
        obs = torch.stack(obs).to(device)
        actions = torch.stack(actions).to(device)
        rewards = torch.stack(rewards).to(device)
        next_obs = torch.stack(next_obs).to(device)
        dones = torch.stack(dones).to(device)

        # # convert list to tensor for easy slciing
        # batch = torch.tensor(batch)

        # # slicing to grab elements
        # obs = batch[:,0]
        # actions =  batch[:,1]
        # rewards = batch[:,2]
        # next_obs = batch[:,3]
        # dones = batch[:,4]

        # tuple of tensors
        batch = (obs, actions, rewards, next_obs, dones)

        return batch
    
    # def prepopulate(self, env, actor):
    #     # select action from actor
    #     # execute action in the env
    #     # store transition

    #     # intialize state
    #     state,_ = env.reset()

    #     # loop through num_steps
    #     for i in range(self.buffer_size):
    #         # choose random action from environment action space (random policy)
    #         action = env.action_space.sample()
    #         # take action: get next state, reward, done
    #         next_state, reward, done, truncate,_ = env.step(action)
    #         # add transition to memory
    #         self.insert(state, action, reward, next_state, done)

    #         # update state
    #         if done or truncate:
    #             # if truncation reached, reset state
    #             state,_ = env.reset()
    #         else:
    #             # all else state is the next
    #             state = next_state

In [330]:
# TESTING REPLAY BUFFER
# test = ReplayBuffer(5)
# test.insert(1,1.0,5,2,True)
# test.insert(1,1.0,5,2,True)
# test.insert(1,1.0,5,2,True)
# test.insert(1,1.0,5,2,True)
# test.insert(1,1.0,5,2,True)
# test.insert(1,1.0,5,2,False)

# sample = test.sample_random_minibatch(3)
# print(sample)

In [331]:
# Actor AKA: The POLICY
class Actor(nn.Module):
    def __init__(self, num_states, num_actions, hidden_dims=(400,300), init_weight = 3e3) -> None:
        super(Actor, self).__init__()
        # In the DDPG paper the parameters for the ACTOR are:
        # - Learning rate: 10^-4
        # - 2 hidden layers
        # - 400 & 300 hidden dims (called units in paper) for first and second hidden layer, respectively
        # - ReLU (rectified nonlinearity) for all hidden layers
        # - output layer uses tanh (returns actions needed for the agent)

        # initializing layer weights
        # - hidden layers weights iniitalized with uniform distribution (-1/sqrt(fan_in), 1/sqrt(fan_in)); fan_in being the input of that particular layer
        # - output layer weights initialized with uniform distribution (-3e-3,3e-3)


        self.init_weight_limit = init_weight

        # hidden layers
        self.hidden1 = nn.Linear(num_states, hidden_dims[0]) # input to hidden
        self.hidden2 = nn.Linear(hidden_dims[0], hidden_dims[1]) # hidden to hidden
        # output layer
        self.output = nn.Linear(hidden_dims[1], num_actions) # hidden to output
        # activation functions
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

        # initialize weights
        self.init_weights()
    
    def forward(self, x):
        # input to first hidden layer w/ relu activation
        x = self.hidden1(x)
        x = self.relu(x)

        # feed into second hidden layer w/ relu activation
        x = self.hidden2(x)
        x = self.relu(x)
        
        # feed through output layer w/ tanh activation
        x = self.output(x)
        y = self.tanh(x)

        return y
    
    def init_weights(self):
        # init hidden with uniform distribution (-1/sqrt(fan_in), 1/sqrt(fan_in)); fan_in being the input of that particular layer
        self.hidden1.weight.data.uniform_(-(1/math.sqrt(self.hidden1.weight.size(1))),(1/math.sqrt(self.hidden1.weight.size(1))))
        self.hidden2.weight.data.uniform_(-(1/math.sqrt(self.hidden2.weight.size(1))),(1/math.sqrt(self.hidden2.weight.size(1))))
        # output layer weights init with uniform distribution (-3e-3,3e-3)
        self.output.weight.data.uniform_(-self.init_weight_limit, self.init_weight_limit)


In [332]:
# Critic AKA: The Q-VALUE FUNCTION
class Critic(nn.Module):
    def __init__(self, num_states, num_actions, output_dim=1, hidden_dims=(400,300), init_weight = 3e3) -> None:
        super(Critic, self).__init__()
        # In the DDPG paper the parameters for the CRITIC are:
        # - Learning rate: 10^-3
        # - 2 hidden layers
        # - 400 & 300 hidden dims (called units in paper) for first and second hidden layer, respectively
        # - ReLU (rectified nonlinearity) for all hidden layers
        # - output layer uses tanh (returns a single q-value for the input state-action pair)
        # - output layer weights initialized with uniform distribution (low=-3e-3,high=3e-3)

        # initializing layer weights
        # - hidden layers weights iniitalized with uniform distribution (-1/sqrt(fan_in), 1/sqrt(fan_in)); fan_in being the input of that particular layer
        # - output layer weights initialized with uniform distribution (-3e-3,3e-3)

        self.init_weight_limit = init_weight

        # hidden layers
        self.hidden1 = nn.Linear(num_states, hidden_dims[0]) # input to hidden, nn.Linear are the next layers after the given input x
        self.hidden2 = nn.Linear(hidden_dims[0]+num_actions, hidden_dims[1]) # hidden to hidden
        # output layer
        self.output = nn.Linear(hidden_dims[1], output_dim) # hidden to output
        # activation functions
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

        # initialize weights
        self.init_weights()

    def forward(self, x):
        # pull state and action from input
        state, action = x

        # first hidden layer and relu activation
        x = self.hidden1(state)
        x = self.relu(x)

        # in critic (Q-value fn) network, the actions are not included until the second hidden layer
        # feed thru w/ relu activation
        x = self.hidden2(torch.cat([x,action],1))
        x = self.relu(x)
        
        # feed through output layer w/ tanh activation
        x = self.output(x)
        y = self.tanh(x)

        return y
    
    def init_weights(self):
        # init hidden with uniform distribution (-1/sqrt(fan_in), 1/sqrt(fan_in)); fan_in being the input of that particular layer
        # alternative method: nn.init.uniform_(self.hidden1.weight, a=-(1/math.sqrt(self.hidden1.weight.size(1))), b=(1/math.sqrt(self.hidden1.weight.size(1))))
        self.hidden1.weight.data.uniform_(-(1/math.sqrt(self.hidden1.weight.size(1))),(1/math.sqrt(self.hidden1.weight.size(1))))
        self.hidden2.weight.data.uniform_(-(1/math.sqrt(self.hidden2.weight.size(1))),(1/math.sqrt(self.hidden2.weight.size(1))))
        # output layer weights init with uniform distribution (-3e-3,3e-3)
        self.output.weight.data.uniform_(-self.init_weight_limit, self.init_weight_limit)


In [333]:
# [reference] https://github.com/udacity/deep-reinforcement-learning/blob/master/ddpg-bipedal/ddpg_agent.py
# Used Udacity tutorial for OU noise generation
class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))])
        self.state = x + dx
        return torch.tensor(self.state, dtype=torch.float32)

In [None]:
class DDPGAgent:
    def __init__(self, env, params, random_seed) -> None:
        # grabbing parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.actor_lr = params['actor_lr']
        self.critic_lr = params['critic_lr']
        self.batch_size = params['minibatch_size']
        self.buffer_size = params['replay_buffer_size']

        # setting random seed
        self.seed = random.seed(random_seed)

        # setting number of states and actions
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        # choose device
        self.device = (
            "cuda"
            if torch.cuda.is_available()
            else "mps"
            if torch.backends.mps.is_available()
            else "cpu"
        )
        print(f"Using {self.device} device")

        # initialize critic network w target
        self.critic = Critic(self.num_states, self.num_actions).to(self.device)
        # summary(self.critic, input_size=(2))
        # creating deepcopy to copy the network over to a target
        self.critic_target = deepcopy(self.critic)
        # define optimizer
        self.critic_optim = Adam(self.critic.parameters(), lr=self.critic_lr)

        # initialize actor network w target
        self.actor = Actor(self.num_states, self.num_actions).to(self.device)
        # summary(self.actor, input_size=(11,))
        # creating deepcopy to copy the network over to a target
        self.actor_target = deepcopy(self.actor)
        # define optimizer
        self.actor_optim = Adam(self.actor.parameters(), lr=self.actor_lr)

        # OU noise for action selection
        self.noise = OUNoise(self.num_actions, random_seed)

        # initialize replay buffer and prepopulate
        self.replay_buffer = ReplayBuffer(self.buffer_size)
        # self.replay_buffer.prepopulate()

    # get action with some noise
    def get_action(self, state):
        # convert to tensor to feed into network
        state = torch.tensor(state, dtype=torch.float32).to(self.device)

        # set to eval mode to not track batch norm
        self.actor.eval()

        with torch.no_grad():
            action = self.actor(state)
            action += self.noise.sample()
        self.actor.train()
        
        return action.numpy()
    
    # updates critic and actor
    def update(self):
        # sample batch
        state_batch, action_batch, reward_batch, next_state_batch, dones_batch = self.replay_buffer.sample_random_minibatch(self.batch_size, self.device)

        # calculate target batch
        with torch.no_grad():
            target = self.calculate_target(reward_batch, next_state_batch)

        # calculate q-value batch
        q_val_batch = self.critic((state_batch, action_batch))

        # update critic by minimizing loss
        loss = nn.MSELoss()

        self.critic_optim.zero_grad()
        loss_val = loss(q_val_batch, target)
        loss_val.backward()        
        self.critic_optim.step()
        
        # using critic to update actor
        loss_actor = -self.critic((state_batch, self.actor(state_batch))) # TODO: should this be negative?
        
        self.actor.zero_grad()
        loss_actor = torch.mean(loss_actor)
        loss_actor.backward()
        self.actor_optim.step()

        # update target network weights
        # update target critic
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - self.tau) + param.data * self.tau
            )
        
        # update target actor
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - self.tau) + param.data * self.tau
            )

        return loss_actor.item()

    def calculate_target(self, reward, next_state):
        next_action = self.actor_target(next_state)
        target = reward + self.gamma*self.critic_target((next_state, next_action))
        return target

In [None]:
# TODO: choose device to run on
# main loop
def run_DDPG():
    # initialize parameters
    params = {'actor_lr': 0.0001,
            'critic_lr': 0.001,
            'tau': 0.001,
            'gamma': 0.99,
            'minibatch_size': 64,
            'replay_buffer_size': int(10e6),
            'steps': 100_000}

    # grab cwd for model saving
    cwd = os.getcwd()
    
    # create environment
    env = gym.make('Hopper-v5')

    # ddpg object
    ddpg = DDPGAgent(env, params, random_seed=10)

    # keep track of loss
    scores = []
    scores_deque = deque(maxlen=100)
    loss = []

    # loop through desired number of steps
    for ep in tqdm.tqdm(range(params['steps']), desc="steps"):
        # reset env
        state,_ = env.reset()

        # initialize terminal state
        done = False

        # track cumulative reward
        cumulative_reward = 0

        # while environment isnt terminal
        while not done:
            # grab action with OU noise
            action = ddpg.get_action(state)
            # execute action in env
            next_state, reward, done, truncate, _ = env.step(action)
            # store in buffer
            ddpg.replay_buffer.insert(state, action, reward, next_state, done)

            # learn when buffer reaches batch size
            if len(ddpg.replay_buffer.buffer) > ddpg.batch_size:
                loss_item = ddpg.update()
                loss.append(loss_item)

            # update state
            state = next_state

            # update cumulative reward
            cumulative_reward += reward

        # append to running score and to score deque for average reward approximation
        scores.append(cumulative_reward)
        scores_deque.append(cumulative_reward)

        # save scores
        if ep % 1000 == 0:
            torch.save(ddpg.actor.state_dict(), cwd+'/checkpoint_actor.pth')
            torch.save(ddpg.critic.state_dict(), cwd+'/checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(ep, np.mean(scores_deque)))   
        
        # reset env if done
        env.reset()

    
    return scores, loss

scores, loss = run_DDPG()

plt.plot(np.arange(1,len(scores)+1), scores)
plt.ylabel('Cumulative reward')
plt.xlabel('Episodes')
plt.show()





Using cpu device


steps:   0%|          | 0/100000 [00:00<?, ?it/s]

[ 1.24741459e+00  1.30447929e-03 -4.92656566e-03 -2.01105469e-03
 -1.92940047e-03 -1.78545562e-03  2.63571942e-03 -4.79845063e-03
 -1.22794558e-03 -3.66053091e-03 -4.49768400e-03]


  return F.mse_loss(input, target, reduction=self.reduction)
steps:   0%|          | 1/100000 [00:00<7:43:28,  3.60it/s]

Episode 0	Average Score: 43.09
[ 1.25120947e+00 -4.38383478e-04  1.33654910e-03  3.48032333e-03
  1.05385943e-04  1.95775182e-03 -3.02278952e-03  4.81072188e-03
  3.21575717e-03 -4.76749248e-03 -3.97170813e-03]


steps:   0%|          | 2/100000 [00:01<16:55:53,  1.64it/s]

[ 1.25008076e+00  7.95658283e-04 -1.72570740e-04  1.72518823e-03
 -1.73637295e-03 -4.96398979e-03  1.29146062e-03 -3.41471980e-04
 -3.45857167e-04 -1.23128617e-03 -2.06952534e-03]


steps:   0%|          | 3/100000 [00:01<16:00:05,  1.74it/s]

[ 1.25112486e+00  9.29236930e-04 -4.16233536e-03  4.12463023e-03
 -2.67299489e-03  1.16860897e-03 -4.64126375e-03 -2.69559180e-03
 -2.65172142e-03 -3.65637347e-03 -2.74474008e-03]


steps:   0%|          | 4/100000 [00:02<16:16:14,  1.71it/s]

[ 1.24929492e+00 -1.75693815e-03  1.81950674e-03 -9.52513946e-04
  4.04060838e-03 -3.68583759e-03 -2.98789029e-03  1.02043315e-03
  3.16876547e-03  4.89361625e-03  2.97000846e-03]


steps:   0%|          | 5/100000 [00:03<19:01:43,  1.46it/s]

[ 1.25128424e+00  2.29820107e-03  3.61468411e-03 -7.40537157e-04
 -7.29177068e-04 -3.15461095e-03  5.07596169e-04  3.59385689e-03
  4.24763432e-04  2.64644477e-03 -1.62530329e-03]


steps:   0%|          | 6/100000 [00:04<22:07:45,  1.26it/s]

[ 1.25053708e+00  1.37449386e-04 -8.04071235e-04  4.52189303e-03
  4.80515436e-03  3.85165671e-03  5.70193836e-04  1.38848416e-03
 -4.61426268e-03 -3.43437194e-03 -1.92962610e-03]


steps:   0%|          | 7/100000 [00:04<20:20:29,  1.37it/s]

[ 1.25310249e+00  3.37176982e-03 -1.13657700e-03  7.03040170e-04
  2.86024392e-03  2.61839314e-03 -4.09741336e-04  3.12070183e-03
  1.11337728e-03  1.57001179e-03 -3.68992725e-03]


steps:   0%|          | 8/100000 [00:05<16:59:04,  1.64it/s]

[ 1.24711921e+00  1.64918037e-03 -3.39727831e-03 -3.16555009e-03
  2.28718996e-03  3.10086720e-04  2.12679539e-03 -2.92036291e-03
  2.89923706e-03  2.33714590e-04 -4.50015094e-03]


steps:   0%|          | 9/100000 [00:05<19:35:00,  1.42it/s]

[ 1.25087134e+00 -9.05095140e-04 -3.83651796e-03  1.23659697e-03
 -3.11301594e-03 -1.32221696e-03 -1.34791276e-04  1.14922832e-03
  1.33429802e-03 -8.84357820e-04 -1.01501416e-03]


steps:   0%|          | 10/100000 [00:06<17:31:37,  1.58it/s]

[ 1.25050828e+00 -3.41413022e-03 -1.98758726e-03 -8.10437979e-04
 -8.63284574e-04 -1.26011879e-03  4.64500350e-03 -1.41954314e-03
 -1.64863349e-04 -1.81600242e-03 -2.32262367e-03]


steps:   0%|          | 11/100000 [00:07<19:13:30,  1.44it/s]

[ 1.24572263e+00  3.95313634e-04  2.15244303e-03  4.16451125e-03
 -1.91360290e-03 -1.41157000e-03  3.30800008e-04 -1.86824070e-03
  1.93920319e-03  3.64083463e-03 -3.03394776e-03]


steps:   0%|          | 12/100000 [00:08<19:28:30,  1.43it/s]

[ 1.25440081e+00 -3.78372126e-03  1.68362922e-04 -4.79548624e-03
  1.90957420e-03 -4.57937292e-03  3.14908678e-03 -3.61653651e-03
  3.80232004e-03  1.40278713e-03  1.08893894e-03]


steps:   0%|          | 13/100000 [00:08<21:22:03,  1.30it/s]

[ 1.24733964e+00 -1.01430691e-03 -2.36001984e-03 -3.69522718e-04
 -2.34978528e-04  4.63250799e-03  3.10137337e-03  4.09544628e-04
  1.41364731e-03  3.15942659e-03 -2.14530199e-03]


steps:   0%|          | 14/100000 [00:09<22:43:34,  1.22it/s]

[ 1.24577789e+00  4.66714513e-03 -7.85593834e-04 -2.12533789e-03
 -4.89334169e-04  3.78541397e-03 -3.19381430e-03  1.11000312e-03
 -3.72353602e-03  8.57812778e-04 -1.27305436e-03]


steps:   0%|          | 15/100000 [00:11<25:37:02,  1.08it/s]

[ 1.24934296e+00 -3.34615185e-03  1.87466469e-03 -3.79202657e-03
  5.28321871e-04 -4.63464006e-03 -4.15048083e-03  2.96964616e-03
 -4.96372480e-03 -1.25364324e-03 -3.06635361e-03]


steps:   0%|          | 16/100000 [00:11<22:04:02,  1.26it/s]

[ 1.24680134e+00 -3.67071391e-03 -3.13281795e-03 -6.72942102e-04
 -1.39560286e-03  2.82283133e-03  4.52942585e-03 -8.39546117e-04
  2.12055366e-03 -4.79089033e-03 -1.10745060e-03]


steps:   0%|          | 17/100000 [00:12<23:06:07,  1.20it/s]

[ 1.25109252e+00  2.04211236e-03  2.71203303e-03 -4.01809607e-04
  3.01540325e-03 -2.02564967e-03 -4.34757062e-03  3.99504941e-03
  6.40905755e-04  2.68145601e-03 -1.54896481e-03]


steps:   0%|          | 17/100000 [00:13<21:34:24,  1.29it/s]


KeyboardInterrupt: 

In [354]:
# grab cwd for model saving
cwd = os.getcwd()

# instantiate env
env = gym.make('Hopper-v5', render_mode='human')

# initialize parameters
params = {'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'tau': 0.001,
        'gamma': 0.99,
        'minibatch_size': 64,
        'replay_buffer_size': int(10e6),
        'steps': 100_000}

ddpg = DDPGAgent(env, params, random_seed=10)

ddpg.actor.load_state_dict(torch.load(cwd+'/checkpoint_actor.pth'))
ddpg.critic.load_state_dict(torch.load(cwd+'/checkpoint_critic.pth'))

state,_ = env.reset()  
while True:
    action = ddpg.get_action(state)
    env.render()
    next_state, reward, done, truncate, _ = env.step(action)
    state = next_state
    if done:
        break
        
env.close()

Using cpu device


c:\Users\aaron\AppData\Local\Programs\Python\Python312\Lib\site-packages\glfw\__init__.py:917: GLFWError: (65537) b'The GLFW library is not initialized'
