## Import everything that we need here

In [1]:
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

In [2]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Using cuda
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")

print(use_cuda)
print(device)

True
cuda


## Create Environments

In [4]:
from common.multiprocessing_env import SubprocVecEnv

num_envs = 16
env_name = "Pendulum-v1"

def make_env():
    def _thunk():
        env = gym.make(env_name, render_mode="human")
        return env 
    return _thunk()

envs = [make_env() for i in range(num_envs)]
#envs = SubprocVecEnv(envs)

env = gym.make(env_name, render_mode="rgb_array")

In [5]:
env

<TimeLimit<OrderEnforcing<PassiveEnvChecker<PendulumEnv<Pendulum-v1>>>>>

## Defining the NN

In [34]:
def init_weights(m):
    # print(f"This is m : {m}")
    # print(f"This is the type of m : {type(m)}")
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)

class ActorCritic(nn.Module):
    def __init__(
            self,
            num_inputs,
            num_outputs,
            hidden_size,
            std=0.0
    ):
        super(ActorCritic, self).__init__()

        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
        )

        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)

        self.apply(init_weights)

    def forward(
            self,
            x
    ):
        value = self.critic(x)
        mu = self.actor(x)
        std = self.log_std.exp().expand_as(mu)
        dist = Normal(mu, std)
        return dist, value

In [35]:
num_input = env.observation_space.shape[0]
num_output = env.action_space.shape[0]

print(num_input, num_output)

hidden_size      = 256
lr               = 3e-4
num_steps        = 20
mini_batch_size  = 5
ppo_epochs       = 4
threshold_reward = -200

model = ActorCritic(num_input, num_output, hidden_size).to(device)

for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

optimizer = optim.Adam(model.parameters(), lr=lr)

3 1
log_std: torch.Size([1, 1])
critic.0.weight: torch.Size([256, 3])
critic.0.bias: torch.Size([256])
critic.2.weight: torch.Size([1, 256])
critic.2.bias: torch.Size([1])
actor.0.weight: torch.Size([256, 3])
actor.0.bias: torch.Size([256])
actor.2.weight: torch.Size([1, 256])
actor.2.bias: torch.Size([1])


In [36]:
state, info = env.reset()
env.render()
state = torch.FloatTensor(state).unsqueeze(0).to(device)

state, state.shape

(tensor([[ 0.1495, -0.9888, -0.0220]], device='cuda:0'), torch.Size([1, 3]))

In [62]:
def collect_experience(
    env,
    model,
    num_steps
):
    pass

    frames = []
    rewards = []
    actions = []
    values  = []
    masks = []
    states = []
    log_probs = []
    entropy = 0

    state, info = env.reset()

    for step in range(num_steps):
        
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _, _ = env.step(action.cpu().numpy()) #if you pass in action.cpy().numpy()[0] we will get a scalar value or single value tuple (3,)

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        states.append(state)
        actions.append(action)
        
        state = np.array(next_state).flatten()

        print('-' * 50)
        print(f"Step : {step}")
        print(f"Current State : {state}")
        print(f"Currrent Action : {action}")
        print(f"Next State : {next_state}")
        
        # The interesting part is how the reward is calculated
        # reward is -(angle_cost + velocity_cost + action_cost)

        # angle_cost = angle**2                      Penalty for being away from upright (0°)
        # velocity_cost = 0.1 * angular_velocity**2  Penalty for moving too fast
        # action_cost = 0.001 * action**2            Penalty for using large torques
        print(f"Reward : {reward}")
        
        #frame = env.render()
        #frames.append(frame)
    
    return (
        frames,
        rewards,
        actions,
        values,
        masks,
        states,
        log_probs,
        entropy,
        next_state
    )



In [70]:
# Generalized advtange exstimate is the total advantage that is expected at any particular time. Advtange is the reward that will be used to train the RL network
def compute_gae(next_value, 
            rewards,
            values, 
            masks,
            gamma=0.9, 
            tau=0.95
        ):
    
    values = values + [next_value]
    gae = 0
    returns = []
    
    for step in reversed(range(len(rewards))):

        advantage_current_step = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]

        # multi step advantage
        gae = advantage_current_step + gamma * tau * masks[step] * gae

        return_value = gae + values[step]

        returns.insert(0, return_value)
    
    return returns

In [72]:
def plot(frame_idx, rewards):
    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    plt.plot(rewards)
    plt.show()

def test_env(
    env,
    model,
    total_steps = 100,
    vis=False,
):
    
    frames, rewards, actions, values, masks, states, log_probs, entropy, next_state = collect_experience(
        env,
        model,
        total_steps
    )

    next_state = np.array(next_state).flatten()
    total_reward = sum(rewards)

    next_state = torch.FloatTensor(next_state).unsqueeze(0).to(device)
    _, next_value = model(next_state)
    returns = compute_gae(next_value, rewards, masks, values)

    return (
        total_reward,
        returns
    )

# This is how the environment works 
final_reward,returns = test_env(
    env,
    model,
    vis=True
)

print(final_reward)
print(returns)

--------------------------------------------------
Step : 0
Current State : [ 0.5180702 -0.8553381 -0.8166661]
Currrent Action : tensor([[0.3133]], device='cuda:0')
Next State : [[ 0.5180702]
 [-0.8553381]
 [-0.8166661]]
Reward : [-0.976742]
--------------------------------------------------
Step : 1
Current State : [ 0.45955065 -0.8881516  -1.3420826 ]
Currrent Action : tensor([[0.7739]], device='cuda:0')
Next State : [[ 0.45955065]
 [-0.8881516 ]
 [-1.3420826 ]]
Reward : [-1.1203858]
--------------------------------------------------
Step : 2
Current State : [ 0.37543204 -0.9268499  -1.8525264 ]
Currrent Action : tensor([[1.0378]], device='cuda:0')
Next State : [[ 0.37543204]
 [-0.9268499 ]
 [-1.8525264 ]]
Reward : [-1.3765163]
--------------------------------------------------
Step : 3
Current State : [ 0.26140964 -0.96522796 -2.407611  ]
Currrent Action : tensor([[0.9337]], device='cuda:0')
Next State : [[ 0.26140964]
 [-0.96522796]
 [-2.407611  ]]
Reward : [-1.7504954]
-----------