## PPO Implementation from Scratch
NOTE: This is an early phase of training, so it will look chaotic when running. That is normal and to be expected.

Additional Info: This model trains the Spot Mini Mini quadruped, in the PyBullet simulated environment, to turn towards a randomly generated ball and walk towards it.

In [None]:
# importing all the necessary libraries
from spotmicro import spot_gym_env
import numpy as np
import pybullet as p
import pybullet_data
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions.normal as Normal



In [None]:
# defining the PPO memory class
# the PPO memory class is used to store the experience of the agent
# the experience is stored in the form of states, actions, rewards, dones, probs and vals
# honestly didn't end up using this class, but it is here for reference
class PPOMemory:
    """Simple rollout buffer holding per-step experience in parallel lists.

    Each step contributes one entry to ``states``, ``actions``, ``probs``,
    ``vals``, ``rewards`` and ``dones``. ``batch_size`` controls how many
    step indices go into each mini-batch produced by ``generate_batches``.
    """

    def __init__(self, batch_size):
        self.states, self.probs, self.vals = [], [], []
        self.actions, self.rewards, self.dones = [], [], []

        self.batch_size = batch_size

    def generate_batches(self):
        """Return the stored data as arrays plus shuffled index batches.

        Returns:
            A 7-tuple ``(states, actions, probs, vals, rewards, dones,
            batches)`` where the first six items are NumPy arrays built from
            the stored lists and ``batches`` is a list of int64 index arrays,
            each at most ``batch_size`` long.
        """
        total = len(self.states)
        starts = np.arange(0, total, self.batch_size)

        # Shuffle step indices in place; the stored lists keep their order.
        order = np.arange(total, dtype=np.int64)
        np.random.shuffle(order)

        batches = []
        for begin in starts:
            batches.append(order[begin:begin + self.batch_size])

        fields = ('states', 'actions', 'probs', 'vals', 'rewards', 'dones')
        stacked = tuple(np.array(getattr(self, field)) for field in fields)
        return (*stacked, batches)

    def store_memory(self, state, action, probs, vals, reward, done):
        """Append one environment step to the buffer."""
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(probs)
        self.vals.append(vals)
        self.rewards.append(reward)
        self.dones.append(done)

    def clear_memory(self):
        """Drop all stored steps by rebinding every list to a fresh one."""
        for field in ('states', 'probs', 'actions', 'rewards', 'dones', 'vals'):
            setattr(self, field, [])

# defining the policy network
# the policy network is a neural network that takes the state as input
# and outputs the mean and standard deviation of the action distribution
class PolicyNet:
    """Gaussian policy: an MLP produces the action mean, ``log_std`` the spread.

    Args:
        input_dim: size of the state vector fed to the network.
        output_dim: number of action dimensions.

    Attributes:
        model: ``nn.Sequential`` with three linear layers (64 hidden units,
            ReLU between them) that outputs the action mean.
        log_std: state-independent log standard deviation, shape
            ``(1, output_dim)``, initialised to zero (std = 1).
        optimizer: Adam (lr=0.001) over the MLP weights and ``log_std``.
    """

    def __init__(self, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim

        # defining the policy network as a sequential model
        # the policy network consists of three fully connected layers with ReLU activation functions
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)  # Output the mean of the action distribution
        )
        # log_std is a learnable parameter that represents the (log) standard deviation
        # of the action distribution; it is initialized to zero and learned during training
        self.log_std = nn.Parameter(torch.zeros(1, output_dim))

        # Adam optimizer for the policy network.
        # log_std is not part of self.model, so it has to be handed to the
        # optimizer explicitly; otherwise it would never be updated.
        self.optimizer = optim.Adam(
            [*self.model.parameters(), self.log_std], lr=0.001
        )

    # defining the forward pass of the policy network
    def forward(self, state):
        """Return ``(mean, std)`` of the action distribution for ``state``.

        ``mean`` is the MLP output for ``state``; ``std`` is
        ``exp(log_std)`` with shape ``(1, output_dim)``.
        """
        mean = self.model(state)
        std = torch.exp(self.log_std)
        return mean, std

# defining the value network
# the value network is a neural network that takes the state as input
# and outputs the value of the state
class ValueNet:
    """State-value estimator: an MLP mapping a state to a single scalar.

    Args:
        input_dim: size of the state vector fed to the network.

    Attributes:
        model: ``nn.Sequential`` of three linear layers (64 hidden units)
            with ReLU activations in between, ending in one output unit.
        optimizer: Adam (lr=0.001) over the parameters of ``model``.
    """

    def __init__(self, input_dim):
        self.input_dim = input_dim

        # Layers are created in input -> output order so that parameter
        # initialisation consumes the RNG in the same sequence as before.
        hidden_units = 64
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),  # single value estimate per state
        ]
        self.model = nn.Sequential(*layers)

        # Adam drives the updates of the value network.
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

    def forward(self, state):
        """Run ``state`` through the MLP and return the value estimate."""
        estimate = self.model(state)
        return estimate
    
# defining the PPO agent
# the PPO agent is responsible for interacting with the environment and learning from the experience
# the PPO agent uses the policy and value networks to choose actions and estimate the value of the states
class PPOAgent:
    """PPO agent that collects one trajectory and trains on it.

    The agent owns a ``PolicyNet`` and a ``ValueNet``. ``choose_action``
    samples an action, ``store_trajectory`` records the step, and ``learn``
    runs several epochs of clipped-surrogate updates over shuffled
    mini-batches of the recorded trajectory.

    Args:
        input_dim: size of the observation vector.
        output_dim: number of action dimensions.
        hyperparams: dict providing ``'gamma'`` (discount factor),
            ``'epsilon'`` (clip range), ``'epochs'`` (passes over the
            trajectory per ``learn`` call) and ``'batch_size'``
            (steps per mini-batch).
    """

    # Factor applied to every sampled action before it is returned to the
    # caller. ``learn`` divides stored actions by it to recover the raw
    # sample whose log-probability was recorded.
    ACTION_SCALE = 0.5

    def __init__(self, input_dim, output_dim, hyperparams):
        self.trajectory = {
            'states': [],
            'actions': [],
            'rewards': [],
            'dones': [],
            'probs': [],
            'vals': []
        }
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.policy = PolicyNet(input_dim, output_dim)
        self.value = ValueNet(input_dim)
        self.gamma = hyperparams['gamma']
        self.clip_ratio = hyperparams['epsilon']
        self.epochs = hyperparams['epochs']
        self.batch_size = hyperparams['batch_size']
        self.memory = PPOMemory(self.batch_size)

    def remember(self, state, action, probs, vals, reward, done):
        """Store one step in the (currently unused) ``PPOMemory`` buffer."""
        self.memory.store_memory(state, action, probs, vals, reward, done)

    def store_trajectory(self, state, action, probs, vals, reward, done):
        """Append one environment step to the current trajectory."""
        self.trajectory['states'].append(state)
        self.trajectory['actions'].append(action)
        self.trajectory['probs'].append(probs)
        self.trajectory['vals'].append(vals)
        self.trajectory['rewards'].append(reward)
        self.trajectory['dones'].append(done)

    def clear_trajectory(self):
        """Reset every trajectory list, including derived returns/advantages."""
        # Resetting all keys (not only the six raw ones) keeps stale
        # 'advantages' / 'returns' from a previous episode from leaking
        # into the next one.
        for key in self.trajectory:
            self.trajectory[key] = []

    def compute_advantage(self):
        """Compute discounted returns and advantages for the trajectory.

        For each step ``t`` the return is ``G_t = r_t + gamma * G_{t+1}``,
        with the running return reset at every step flagged ``done`` so that
        rewards never leak across episode boundaries. The advantage is
        ``G_t - V(s_t)`` using the value recorded when the action was chosen.

        Results are written to ``self.trajectory['returns']`` and
        ``self.trajectory['advantages']`` as lists of floats.
        """
        rewards = self.trajectory['rewards']
        dones = self.trajectory['dones']
        vals = self.trajectory['vals']

        returns = []
        running_return = 0.0
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                # Nothing after a terminal step belongs to this episode.
                running_return = 0.0
            running_return = float(reward) + self.gamma * running_return
            returns.append(running_return)
        returns.reverse()

        self.trajectory['returns'] = returns
        self.trajectory['advantages'] = [
            ret - float(val) for ret, val in zip(returns, vals)
        ]

    def choose_action(self, state):
        """Sample an action for ``state`` from the current policy.

        Returns:
            ``(action, prob, val)`` where ``action`` is the sampled action
            squeezed and multiplied by ``ACTION_SCALE``, ``prob`` is the
            summed log-probability of the raw (unscaled) sample, and ``val``
            is the value estimate for ``state`` as a Python float.
        """
        # No gradients are needed while acting; keeping them would attach an
        # autograd graph to the stored "old" log-probabilities.
        with torch.no_grad():
            mean, std = self.policy.forward(state)
            dist = Normal.Normal(mean, std)
            raw_action = dist.sample()
            prob = dist.log_prob(raw_action).sum(dim=-1, keepdim=True).unsqueeze(0)
            val = self.value.forward(state).item()

        action = raw_action.squeeze() * self.ACTION_SCALE
        return action, prob, val

    def _shuffled_batches(self):
        """Return shuffled step indices split into chunks of ``batch_size``."""
        n_steps = len(self.trajectory['states'])
        order = np.arange(n_steps, dtype=np.int64)
        np.random.shuffle(order)
        size = max(1, int(self.batch_size))
        return [order[start:start + size] for start in range(0, n_steps, size)]

    def generate_batches(self):
        """Split the trajectory into shuffled mini-batches.

        Returns:
            ``(states, actions, rewards, dones, probs, values, advantages)``;
            each item is a list with one entry per mini-batch, and each entry
            is a list of at most ``batch_size`` per-step values. An empty
            trajectory yields seven empty lists.
        """
        n_steps = len(self.trajectory['states'])
        # Make sure advantages exist and match the current trajectory.
        if len(self.trajectory.get('advantages', ())) != n_steps:
            self.compute_advantage()

        keys = ('states', 'actions', 'rewards', 'dones', 'probs', 'vals', 'advantages')
        batched = {key: [] for key in keys}
        for batch in self._shuffled_batches():
            for key in keys:
                batched[key].append([self.trajectory[key][i] for i in batch])

        return tuple(batched[key] for key in keys)

    def learn(self):
        """Run PPO updates on the stored trajectory.

        For ``epochs`` passes, the trajectory is shuffled into mini-batches.
        The policy is updated with the clipped surrogate objective using the
        log-probability of the *stored* actions, and the value network is
        regressed onto the discounted returns. Does nothing when the
        trajectory is empty.
        """
        n_steps = len(self.trajectory['states'])
        if n_steps == 0:
            return

        self.compute_advantage()

        states = torch.stack([
            torch.as_tensor(s, dtype=torch.float).detach()
            for s in self.trajectory['states']
        ])
        # Stored actions were scaled by ACTION_SCALE; undo that to get the
        # samples whose log-probabilities were recorded.
        raw_actions = torch.stack([
            torch.as_tensor(a, dtype=torch.float).detach().reshape(-1)
            for a in self.trajectory['actions']
        ]) / self.ACTION_SCALE
        old_log_probs = torch.cat([
            torch.as_tensor(p, dtype=torch.float).detach().reshape(1)
            for p in self.trajectory['probs']
        ])
        advantages = torch.tensor(self.trajectory['advantages'], dtype=torch.float)
        returns = torch.tensor(self.trajectory['returns'], dtype=torch.float)

        for _ in range(self.epochs):
            for batch in self._shuffled_batches():
                idx = torch.as_tensor(batch)

                # --- policy update (clipped surrogate objective) ---
                mean, std = self.policy.forward(states[idx])
                dist = Normal.Normal(mean, std)
                new_log_probs = dist.log_prob(raw_actions[idx]).sum(dim=-1)

                ratio = torch.exp(new_log_probs - old_log_probs[idx])
                unclipped = ratio * advantages[idx]
                clipped = torch.clamp(
                    ratio, 1 - self.clip_ratio, 1 + self.clip_ratio
                ) * advantages[idx]
                # Element-wise min keeps the autograd graph intact, unlike
                # rebuilding a tensor from Python scalars.
                policy_loss = -torch.min(unclipped, clipped).mean()

                self.policy.optimizer.zero_grad()
                policy_loss.backward()
                self.policy.optimizer.step()

                # --- value update (regression onto discounted returns) ---
                values = self.value.forward(states[idx]).squeeze(-1)
                value_loss = ((values - returns[idx]) ** 2).mean()

                self.value.optimizer.zero_grad()
                value_loss.backward()
                self.value.optimizer.step()


In [None]:
# defining hyperparameters
# NOTE: PPOAgent reads 'gamma', 'epsilon', 'epochs' and 'batch_size' only;
# 'learning_rate' is not consumed there (the networks hard-code lr=0.001).
hyperparams = {
    'learning_rate': 0.001,
    'gamma': 0.9,
    'epsilon': 0.2,
    'epochs': 5,
    'batch_size': 64,
}

# number of training episodes; the more episodes the better the agent will perform
NUM_EPISODES = 200

# initialize the env
env = spot_gym_env.spotGymEnv()
env.render(mode='human')
env.reset()
env.seed(0)

# initialize the agent
agent = PPOAgent(env.observation_space.shape[0], env.action_space.shape[0], hyperparams)

# training the agent
# range is 1-based and inclusive of NUM_EPISODES so exactly NUM_EPISODES episodes run
for episode in range(1, NUM_EPISODES + 1):
    # getting the initial state of the environment
    # and converting it to a tensor
    obs = env.reset()
    obs = torch.tensor(obs, dtype=torch.float)
    done = False
    score = 0
    agent.clear_trajectory()

    # running the agent in the environment
    # the agent keeps taking actions until the environment reports done
    while not done:
        action, prob, val = agent.choose_action(obs)
        next_obs, reward, done, _ = env.step(action)
        next_obs = torch.tensor(next_obs, dtype=torch.float)

        # storing the trajectory of the agent, which includes all the info gained from the step in the environment
        # this will be used to update the policy and value networks
        agent.store_trajectory(obs, action, prob, val, reward, done)

        score += reward

        # setting the next observation as the current observation
        obs = next_obs

    # learning from the trajectory of the agent
    agent.learn()
    print(f"Episode: {episode}, Score: {score}")
