## Import everything that we need here

In [1]:
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

In [2]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Using cuda
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")

print(use_cuda)
print(device)

False
cpu


## Create Environments

In [4]:
from common.multiprocessing_env import SubprocVecEnv

num_envs = 16
env_name = "Pendulum-v1"

def make_env():
    def _thunk():
        env = gym.make(env_name, render_mode="human")
        return env 
    return _thunk()

envs = [make_env() for i in range(num_envs)]
#envs = SubprocVecEnv(envs)

env = gym.make(env_name, render_mode="rgb_array")

In [5]:
env

<TimeLimit<OrderEnforcing<PassiveEnvChecker<PendulumEnv<Pendulum-v1>>>>>

## Defining the NN

In [6]:
def init_weights(m):
    # print(f"This is m : {m}")
    # print(f"This is the type of m : {type(m)}")
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)

class ActorCritic(nn.Module):
    def __init__(
            self,
            num_inputs,
            num_outputs,
            hidden_size,
            std=0.0
    ):
        super(ActorCritic, self).__init__()

        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
        )

        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)

        self.apply(init_weights)

    def forward(
            self,
            x
    ):
        value = self.critic(x)
        mu = self.actor(x)
        std = self.log_std.exp().expand_as(mu)
        dist = Normal(mu, std)
        return dist, value

In [7]:
num_input = env.observation_space.shape[0]
num_output = env.action_space.shape[0]

print(num_input, num_output)

hidden_size      = 256
lr               = 3e-4
num_steps        = 20
mini_batch_size  = 5
ppo_epochs       = 4
threshold_reward = -200

model = ActorCritic(num_input, num_output, hidden_size).to(device)

for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

optimizer = optim.Adam(model.parameters(), lr=lr)

3 1
log_std: torch.Size([1, 1])
critic.0.weight: torch.Size([256, 3])
critic.0.bias: torch.Size([256])
critic.2.weight: torch.Size([1, 256])
critic.2.bias: torch.Size([1])
actor.0.weight: torch.Size([256, 3])
actor.0.bias: torch.Size([256])
actor.2.weight: torch.Size([1, 256])
actor.2.bias: torch.Size([1])


In [16]:
def collect_experience(
    env,
    model,
    num_steps
):
    pass

    frames = []
    rewards = []
    actions = []
    values  = []
    masks = []
    states = []
    log_probs = []
    entropy = 0

    state, info = env.reset()

    for step in range(num_steps):
        
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _, _ = env.step(action.cpu().numpy()) #if you pass in action.cpy().numpy()[0] we will get a scalar value or single value tuple (3,)

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        states.append(state)
        actions.append(action)
        
        state = np.array(next_state).flatten()

        print('-' * 50)
        print(f"Step : {step}")
        print(f"Current State : {state}")
        print(f"Currrent Action : {action}")
        print(f"Next State : {next_state}")
        
        # The interesting part is how the reward is calculated
        # reward is -(angle_cost + velocity_cost + action_cost)

        # angle_cost = angle**2                      Penalty for being away from upright (0°)
        # velocity_cost = 0.1 * angular_velocity**2  Penalty for moving too fast
        # action_cost = 0.001 * action**2            Penalty for using large torques
        print(f"Reward : {reward}")
        
        #frame = env.render()
        #frames.append(frame)
    
    return (
        frames,
        rewards,
        actions,
        values,
        masks,
        states,
        log_probs,
        entropy,
        next_state
    )



In [36]:
# Generalized advtange exstimate is the total advantage that is expected at any particular time. Advtange is the reward that will be used to train the RL network
def compute_gae(next_value, 
            rewards,
            values, 
            masks,
            gamma=0.9, 
            tau=0.95
        ):
    
    values = values + [next_value]
    gae = 0
    returns = []
    
    for step in reversed(range(len(rewards))):

        if masks[step] == 0:
            gae = rewards[step]
        else:
            gae = rewards[step] + gamma * (tau * gae + (1 - tau) * values[step + 1])

        returns.insert(0, gae)
    
    return returns

In [37]:
def plot(frame_idx, rewards):
    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    plt.plot(rewards)
    plt.show()

def test_env(
    env,
    model,
    total_steps = 100,
    vis=False,
):
    
    frames, rewards, actions, values, masks, states, log_probs, entropy, next_state = collect_experience(
        env,
        model,
        total_steps
    )

    next_state = np.array(next_state).flatten()
    total_reward = sum(rewards)

    next_state = torch.FloatTensor(next_state).unsqueeze(0).to(device)
    _, next_value = model(next_state)
    returns = compute_gae(next_value, rewards, masks, values)

    return (
        total_reward,
        returns
    )

# This is how the environment works 
final_reward,returns = test_env(
    env,
    model,
    vis=True
)

print(final_reward)
print(returns)

--------------------------------------------------
Step : 0
Current State : [-0.9914563  -0.13043961  0.55495125]
Currrent Action : tensor([[1.2146]])
Next State : [[-0.9914563 ]
 [-0.13043961]
 [ 0.55495125]]
Reward : [-9.25436987]
--------------------------------------------------
Step : 1
Current State : [-0.98816854 -0.15337192  0.46334615]
Currrent Action : tensor([[0.0415]])
Next State : [[-0.98816854]
 [-0.15337192]
 [ 0.46334615]]
Reward : [-9.0955968]
--------------------------------------------------
Step : 2
Current State : [-0.9847349  -0.17406097  0.41944867]
Currrent Action : tensor([[0.4742]])
Next State : [[-0.9847349 ]
 [-0.17406097]
 [ 0.41944867]]
Reward : [-8.94752521]
--------------------------------------------------
Step : 3
Current State : [-0.98250175 -0.18625337  0.24790601]
Currrent Action : tensor([[-0.2733]])
Next State : [[-0.98250175]
 [-0.18625337]
 [ 0.24790601]]
Reward : [-8.8186246]
--------------------------------------------------
Step : 4
Current S

## The real training 

In [68]:
def ppo_randomizer(
    mini_batch_size,
    states,
    actions,
    log_probs,
    returns,
    advantages
) :

    batch_size = states.size(0)

    for _ in range(batch_size // mini_batch_size):
        rand_indexes = np.random.randint(0, batch_size, mini_batch_size)
        yield states[rand_indexes], actions[rand_indexes], log_probs[rand_indexes], returns[rand_indexes], advantages[rand_indexes]

def ppo_trainer(
    model,
    optimizer,
    ppo_epochs,
    mini_batch_size,
    states,
    actions,
    log_probs,
    returns,
    advantages,
    clip_param = 0.2
):

    batch_loss = []
    for _ in range(ppo_epochs):
        for state, action, old_log_probs, return_vals, advantage in ppo_randomizer(mini_batch_size, states, actions, log_probs, returns, advantages):
            dist, value = model(state) # Action space and value chosen by the new policy
            entropy = dist.entropy().mean() #what is the entropy of my new policy distribution
            new_log_probs = dist.log_prob(action) #what is the likelihood that my new policy is to take the old action?

            ratio = (new_log_probs - old_log_probs).exp()

            surrogate_unclipped = ratio * advantage
            surrogate_clipped = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

            actor_loss = -torch.min(surrogate_unclipped,surrogate_clipped).mean()
            critic_loss = (return_vals - value).pow(2).mean()

            loss = 0.5 * critic_loss + actor_loss - 0.001* entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss.append(loss.item())
    
    return np.mean(batch_loss)


In [62]:
def check_model_stats(model, iteration):

    all_params = []

    for param in model.parameters():
        all_params.append(param.data.flatten())
    
    all_params = torch.cat(all_params)

    stats = {
        'iteration': iteration,
        'total_params': all_params.numel(),
        'mean': all_params.mean().item(),
        'std': all_params.std().item(),
        'min': all_params.min().item(),
        'max': all_params.max().item(),
        'norm': all_params.norm().item()
    }
    
    return stats

In [73]:
num_inputs  = env.observation_space.shape[0]
num_outputs = env.action_space.shape[0]

#Hyper params:
hidden_size      = 256
lr               = 3e-4
num_steps        = 100
mini_batch_size  = 25
ppo_epochs       = 5
threshold_reward = -200
learning_iterations = 500

model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

def train_the_model(
    env,
    model,
    num_learning_iterations
):

    episodic_rewards = []
    episodic_losses = []

    for it in range(num_learning_iterations):
        print(f"learning iteration : {it}")

        frames, rewards, actions, values, masks, states, log_probs, entropy, next_state = collect_experience(
            env=env,
            model=model,
            num_steps=num_steps
        )

        episodic_rewards.append(sum(rewards))
        
        next_state = np.array(next_state).flatten()

        next_state = torch.FloatTensor(next_state).unsqueeze(0).to(device)
        _, next_value = model(next_state)
        returns = compute_gae(next_value, rewards, masks, values)

        returns = torch.cat(returns).detach()
        log_probs = torch.cat(log_probs).detach()
        values = torch.cat(values).detach()
        states = torch.cat(states)
        actions = torch.cat(actions)
        advantage = returns - values

        loss = ppo_trainer(
            model=model,
            optimizer=optimizer,
            ppo_epochs=ppo_epochs,
            mini_batch_size=mini_batch_size,
            states=states,
            actions=actions,
            log_probs=log_probs,
            returns=returns,
            advantages=advantage
        )

        episodic_losses.append(loss)

    
    return episodic_rewards, episodic_losses



print(check_model_stats(model=model, iteration=0))

train_the_model(
    env=env,
    model=model,
    num_learning_iterations=learning_iterations
)
#check_model_stats(model=model, iteration='inf')


{'iteration': 0, 'total_params': 2563, 'mean': 0.021769622340798378, 'std': 0.0985417440533638, 'min': -0.43270736932754517, 'max': 0.32318469882011414, 'norm': 5.108118534088135}
learning iteration : 0
--------------------------------------------------
Step : 0
Current State : [ 0.6919735 -0.7219229 -0.5213363]
Currrent Action : tensor([[0.2794]])
Next State : [[ 0.6919735]
 [-0.7219229]
 [-0.5213363]]
Reward : [-0.60940066]
--------------------------------------------------
Step : 1
Current State : [ 0.6565972 -0.7542414 -0.9584169]
Currrent Action : tensor([[0.6957]])
Next State : [[ 0.6565972]
 [-0.7542414]
 [-0.9584169]]
Reward : [-0.67822992]
--------------------------------------------------
Step : 2
Current State : [ 0.5972377 -0.8020643 -1.5249134]
Currrent Action : tensor([[-0.0054]])
Next State : [[ 0.5972377]
 [-0.8020643]
 [-1.5249134]]
Reward : [-0.82202315]
--------------------------------------------------
Step : 3
Current State : [ 0.5079634 -0.8613786 -2.1446767]
Curr

ValueError: Expected parameter loc (Tensor of shape (25, 1)) of distribution Normal(loc: torch.Size([25, 1]), scale: torch.Size([25, 1])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan]], grad_fn=<AddmmBackward0>)