## Import everything that we need here

In [39]:
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

In [40]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [41]:
# Select the compute device: CUDA when torch reports it as available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")

# Report the availability flag and the device that was selected.
print(use_cuda)
print(device)

True
cuda


## Create Environments

In [51]:
from common.multiprocessing_env import SubprocVecEnv

num_envs = 16             # how many entries are built into the `envs` list below
env_name = "Pendulum-v1"  # environment id handed to gym.make

def make_env():
    """Return a zero-argument factory that builds one ``env_name`` environment.

    The factory itself is returned (not the result of calling it), so the
    environment is only constructed when the caller invokes the factory.
    Vectorised wrappers such as ``SubprocVecEnv`` conventionally take a list
    of such callables — confirm against ``common.multiprocessing_env``.

    Returns:
        A callable taking no arguments that returns
        ``gym.make(env_name, render_mode="human")``.
    """
    def _thunk():
        env = gym.make(env_name, render_mode="human")
        return env
    # Hand back the callable; calling it here would build the env eagerly.
    return _thunk

# Build `num_envs` entries by calling make_env() once per entry.
envs = [make_env() for i in range(num_envs)]
# The vectorised wrapper is currently disabled, so `envs` stays a plain Python list.
#envs = SubprocVecEnv(envs)

# Separate single environment used by the cells below, created with render_mode="rgb_array".
env = gym.make(env_name, render_mode="rgb_array")

In [52]:
# Display the environment's repr to see which wrappers gym.make applied.
env

<TimeLimit<OrderEnforcing<PassiveEnvChecker<PendulumEnv<Pendulum-v1>>>>>

## Defining the NN

In [53]:
def init_weights(m):
    """Initialise a single sub-module in place; meant for ``nn.Module.apply``.

    Every visited module and its type are printed as a trace. ``nn.Linear``
    layers get weights drawn from a normal distribution (mean 0, std 0.1) and,
    when a bias exists, a bias filled with 0.1. Every other module type is
    left untouched.

    Args:
        m: The module passed in by ``apply``.
    """
    print(f"This is m : {m}")
    print(f"This is the type of m : {type(m)}")

    if not isinstance(m, nn.Linear):
        return

    nn.init.normal_(m.weight, mean=0., std=0.1)
    # nn.Linear(..., bias=False) stores ``bias`` as None; skip it instead of crashing.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.1)

class ActorCritic(nn.Module):
    """Actor-critic network with two independent one-hidden-layer MLPs.

    The critic maps an observation to a single scalar value. The actor maps
    an observation to the mean of a Gaussian policy whose log standard
    deviation is a learnable parameter that does not depend on the input.

    Args:
        num_inputs: Size of the observation vector.
        num_outputs: Size of the action vector.
        hidden_size: Width of the hidden layer in both MLPs.
        std: Initial value of every entry of ``log_std`` (the log of the
            policy standard deviation, so 0.0 means a std of 1).
    """

    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super().__init__()

        # Value head: observation -> hidden -> scalar.
        value_layers = [
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.critic = nn.Sequential(*value_layers)

        # Policy head: observation -> hidden -> action mean.
        policy_layers = [
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
        ]
        self.actor = nn.Sequential(*policy_layers)

        # One learnable log-std per action dimension, shared across the batch.
        self.log_std = nn.Parameter(std * torch.ones(1, num_outputs))

        # Re-initialise every sub-module through the module-level hook.
        self.apply(init_weights)

    def forward(self, x):
        """Return ``(dist, value)`` for a batch of observations ``x``.

        ``dist`` is a ``Normal`` over actions and ``value`` is the critic output.
        """
        value = self.critic(x)
        mean = self.actor(x)
        # Broadcast the (1, num_outputs) std to the shape of the batch of means.
        spread = torch.exp(self.log_std).expand_as(mean)
        return Normal(mean, spread), value

In [54]:
# Network input/output sizes are read from the environment's observation and action spaces.
num_input = env.observation_space.shape[0]
num_output = env.action_space.shape[0]

print(num_input, num_output)

# Hyperparameters. Only hidden_size and lr are consumed in this cell; the
# others are presumably for the PPO training loop — TODO confirm against the
# code that uses them.
hidden_size      = 256
lr               = 3e-4
num_steps        = 20
mini_batch_size  = 5
ppo_epochs       = 4
threshold_reward = -200

# Build the actor-critic network and move it to the selected device.
model = ActorCritic(num_input, num_output, hidden_size).to(device)

# List every registered parameter together with its shape.
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

# A single Adam optimizer over all of the model's parameters.
optimizer = optim.Adam(model.parameters(), lr=lr)

3 1
This is m : Linear(in_features=3, out_features=256, bias=True)
This is the type of m : <class 'torch.nn.modules.linear.Linear'>
This is m : ReLU()
This is the type of m : <class 'torch.nn.modules.activation.ReLU'>
This is m : Linear(in_features=256, out_features=1, bias=True)
This is the type of m : <class 'torch.nn.modules.linear.Linear'>
This is m : Sequential(
  (0): Linear(in_features=3, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=1, bias=True)
)
This is the type of m : <class 'torch.nn.modules.container.Sequential'>
This is m : Linear(in_features=3, out_features=256, bias=True)
This is the type of m : <class 'torch.nn.modules.linear.Linear'>
This is m : ReLU()
This is the type of m : <class 'torch.nn.modules.activation.ReLU'>
This is m : Linear(in_features=256, out_features=1, bias=True)
This is the type of m : <class 'torch.nn.modules.linear.Linear'>
This is m : Sequential(
  (0): Linear(in_features=3, out_features=256, bias=True)
  

In [55]:
# Start a new episode; reset() is unpacked into the initial observation and an info value.
state, info = env.reset()
env.render()
# Convert the observation to a float tensor, add a leading batch dimension, and move it to `device`.
state = torch.FloatTensor(state).unsqueeze(0).to(device)

# Show the tensor together with its shape.
state, state.shape

Authorization required, but no authorization protocol specified


(tensor([[ 0.8853, -0.4650,  0.9080]], device='cuda:0'), torch.Size([1, 3]))

In [56]:
def plot(frame_idx, rewards):
    """Redraw the reward curve, replacing the cell's previous output.

    Args:
        frame_idx: Value shown as the frame counter in the title.
        rewards: Sequence of rewards to plot; its last entry is shown in the title.
    """
    # wait=True defers clearing until new output arrives.
    clear_output(wait=True)
    plt.figure(figsize=(20, 5))
    # First panel of a 1x3 grid.
    plt.subplot(1, 3, 1)
    latest = rewards[-1]
    heading = 'frame %s. reward: %s' % (frame_idx, latest)
    plt.title(heading)
    plt.plot(rewards)
    plt.show()

def test_env(
    env,
    model,
    total_steps = 100,
    vis=False,
):
    state, info = env.reset()
    if vis: env.render()
    done = False
    total_reward = 0
    step = 0

    while ((not done) and (step < total_steps)):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        
        action = dist.sample().cpu().numpy()[0]
        next_state, reward, done, _, _ = env.step(action)
        
        print('-' * 50)
        print(f"Step : {step}")
        print(f"Current State : {state}")
        print(f"Currrent Action : {action}")
        print(f"Next State : {next_state}")
        print(f"Reward : {reward}")
        # The interesting part is how the reward is calculated
        # reward is -(angle_cost + velocity_cost + action_cost)

        # angle_cost = angle**2                      Penalty for being away from upright (0°)
        # velocity_cost = 0.1 * angular_velocity**2  Penalty for moving too fast
        # action_cost = 0.001 * action**2            Penalty for using large torques

        state = next_state
        if vis: env.render()

        total_reward += reward
        step += 1
        
    return total_reward

In [57]:
# Run test_env once on the model built above, with rendering enabled, to see how the environment works.
final_reward = test_env(
    env,
    model,
    vis=True
)

print(final_reward)

--------------------------------------------------
Step : 0
Current State : tensor([[ 0.9338, -0.3579, -0.1309]], device='cuda:0')
Currrent Action : [0.47046965]
Next State : [ 0.9277451  -0.37321448 -0.32879534]
Reward : -0.1359148422056937
--------------------------------------------------
Step : 1
Current State : tensor([[ 0.9277, -0.3732, -0.3288]], device='cuda:0')
Currrent Action : [0.684273]
Next State : [ 0.9180056  -0.3965675  -0.50606525]
Reward : -0.15756328950725693
--------------------------------------------------
Step : 2
Current State : tensor([[ 0.9180, -0.3966, -0.5061]], device='cuda:0')
Currrent Action : [0.68767244]
Next State : [ 0.903559   -0.42846364 -0.70034003]
Reward : -0.19236332914947593
--------------------------------------------------
Step : 3
Current State : tensor([[ 0.9036, -0.4285, -0.7003]], device='cuda:0')
Currrent Action : [-1.769825]
Next State : [ 0.87413234 -0.48568776 -1.2871615 ]
Reward : -0.2482444137093595
---------------------------------

In [64]:
# Scratch check: count down from 5 to 1.
for a in reversed(range(1, 6)):
    print(a)

5
4
3
2
1
