## PPO Implementation Tutorial for Reinforcement Learning
Implementing reinforcement learning with the PPO (Proximal Policy Optimization) algorithm by following a tutorial, so that I can gain a better understanding of how it works before implementing it in our final project.

The tutorial is very long, and I am taking as much time as I need to fully understand the concepts.
By using a jupyter notebook, I can take notes while also checking that everything compiles as I work through the tutorial.

In [4]:
# setting up neural network module
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

class FeedForwardNN(nn.Module):
    """Small fully connected network shared by the actor and the critic.

    The network has two hidden layers of 64 units with ReLU activations
    followed by a linear output layer (no activation on the output).
    It takes in an observation and returns either an action or a value,
    depending on the ``out_dim`` it was built with.
    """

    def __init__(self, in_dim: int, out_dim: int) -> None:
        """Define the neural network layers.

        Args:
            in_dim: Number of input features (size of one observation).
            out_dim: Number of output features.
        """
        super().__init__()
        self.layer1 = nn.Linear(in_dim, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, out_dim)

    def forward(self, obs):
        """Run a forward pass.

        Args:
            obs: Observation(s) as a ``torch.Tensor`` or a ``numpy.ndarray``
                whose last dimension is ``in_dim``.

        Returns:
            A tensor whose last dimension is ``out_dim``.
        """
        # Convert the observation to a float tensor if it is a numpy array,
        # so callers can pass raw environment observations directly.
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float)

        activation1 = F.relu(self.layer1(obs))
        activation2 = F.relu(self.layer2(activation1))
        output = self.layer3(activation2)

        return output

In [5]:
import torch
from torch.distributions import MultivariateNormal
from network import FeedForwardNN

class PPO:
    def __init__(self, env):
        """Set up the environment handle, the networks and the action covariance.

        Args:
            env: Environment object exposing ``observation_space.shape`` and
                ``action_space.shape``; the first entry of each shape is used
                as the observation / action dimension.
        """
        self.env = env

        # Dimensions are read from the first axis of each space's shape.
        observation_shape = env.observation_space.shape
        self.obs_dim = observation_shape[0]
        action_shape = env.action_space.shape
        self.act_dim = action_shape[0]

        # Actor maps observations to an act_dim-sized output; critic maps
        # observations to a single output.
        self.actor = FeedForwardNN(self.obs_dim, self.act_dim)
        self.critic = FeedForwardNN(self.obs_dim, 1)

        # Diagonal covariance matrix with 0.5 in every diagonal entry.
        variance_shape = (self.act_dim,)
        self.cov_var = torch.full(size=variance_shape, fill_value=0.5)
        self.cov_mat = torch.diag(self.cov_var)

    def rollout(self):
        # batch data
        batch_obs = []
        batch_acts = []
        batch_log_probs = []
        batch_rews = []
        batch_rtgs = []
        batch_lens = []

        return batch_obs, batch_acts, batch_log_probs, batch_rews, batch_rtgs, batch_lens

    def learn(self, total_timesteps):
        t_so_far = 0
        while t_so_far < total_timesteps:
            batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens = self.rollout()