In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import random
from collections import namedtuple, deque

In [148]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device =  "cpu"
env = gym.make("LunarLanderContinuous-v2")
action_size = env.action_space.shape[0]
state_size = env.observation_space.shape[0]

ROLLOUT_LENGTH = 300
DISCOUNT = 0.99
GAE_LAMBDA = 0.95
OPTIMIZATION_EPOCHS = 10
MINI_BATCH_SIZE = 64
PPO_RATIO_CLIP = 0.1
GRADIENT_CLIP = 0.75
HIDDEN_LAYERS = 32

print("State size {} with action size {}".format(state_size, action_size))

State size 8 with action size 2


In [149]:
def layer_init(layer, w_scale=1.0):
    nn.init.orthogonal_(layer.weight.data)
    layer.weight.data.mul_(w_scale)
    nn.init.constant_(layer.bias.data, 0)
    return layer

def to_np(t):
    return t.cpu().detach().numpy()

def tensor(x):
    if isinstance(x, torch.Tensor):
        return x
    x = torch.tensor(x, device=DEVICE, dtype=torch.float32)
    return x


In [4]:
class SubNetwork(nn.Module):
    
    def __init__(self, input_size, hidden_units, output_size, seed):
        super(SubNetwork, self).__init__()
        dims = (input_size,) + hidden_units        
        self.layers = nn.ModuleList([layer_init(nn.Linear(dim_in, dim_out)) for dim_in, dim_out in zip(dims[:-1], dims[1:])])
        self.feature_dim = dims[-1]
        self.output_layer = layer_init(nn.Linear(self.feature_dim, output_size), 1e-3)
        
    def forward(self, x):
        for layer in self.layers:
            x = F.tanh(layer(x))
        x = self.output_layer(x)    
        return x    
            
class ActorAndCritic(nn.Module):
    
    def __init__(self, state_size, action_size, seed):
        super(ActorAndCritic, self).__init__()
        self.seed = random.seed(seed)
        self.actor = SubNetwork(state_size, (HIDDEN_LAYERS, HIDDEN_LAYERS), action_size, seed)
        self.critic = SubNetwork(state_size, (HIDDEN_LAYERS, HIDDEN_LAYERS), 1, seed)
        self.std = nn.Parameter(torch.zeros(action_size))
        #self.to(Config.DEVICE)
        
    def forward(self, obs, action=None):
        obs = tensor(obs)
        a = self.actor(obs)
        v = self.critic(obs)
        mean = F.tanh(a)
        dist = torch.distributions.Normal(mean, F.softplus(self.std))
        return (v, dist)

In [5]:
network = ActorAndCritic(state_size, action_size, 0).to(device)
network

ActorAndCritic(
  (actor): SubNetwork(
    (layers): ModuleList(
      (0): Linear(in_features=8, out_features=32, bias=True)
      (1): Linear(in_features=32, out_features=32, bias=True)
    )
    (output_layer): Linear(in_features=32, out_features=2, bias=True)
  )
  (critic): SubNetwork(
    (layers): ModuleList(
      (0): Linear(in_features=8, out_features=32, bias=True)
      (1): Linear(in_features=32, out_features=32, bias=True)
    )
    (output_layer): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [45]:
def get_actions_from_states(network, states):
    if not isinstance(states, torch.Tensor):
        states = torch.tensor(states, dtype=torch.float32, device=device)
    value, distribution = network.forward(states)
    actions = distribution.sample()
    return to_np(actions), distribution

def get_log_probability_from_actions(distribution, action):
    if not isinstance(action, torch.Tensor):
        action = torch.Tensor(action)
    log_prob = distribution.log_prob(action)
    return log_prob.cpu().detach().numpy()

In [46]:
#Random walk
state = env.reset()
while True:
    action, distribution = get_actions_from_states(network, state)
    log_prob = get_log_probability_from_actions(distribution, action)
    print(log_prob)
    env.render()
    next_state, reward, done, _ = env.step(action)
    state = next_state
    time.sleep(0.01)
    if done: break
env.close()

[-0.5533819 -0.6063934]
[-1.0682343  -0.67481214]
[-0.6873418 -1.4387639]
[-0.93667763 -0.58069086]
[-1.6443069 -1.1178186]
[-0.5640476 -1.4939203]
[-2.0048227 -0.9400481]
[-1.1385506 -1.8745768]
[-0.67953557 -0.9073213 ]
[-1.6155418 -1.1281716]
[-1.8016616 -0.96171  ]
[-0.72881395 -0.713392  ]
[-0.8190851 -2.767889 ]
[-0.8583958  -0.55447936]
[-0.8847645 -1.8550687]
[-0.62972987 -1.7197249 ]
[-0.9113766 -2.1528919]
[-0.5716667  -0.73920506]
[-1.0815481 -0.5524727]
[-0.7813052 -0.553159 ]
[-0.6056081 -0.6062563]
[-4.2128825 -0.6141937]
[-0.55996317 -0.55570626]
[-3.6227856 -1.2892565]
[-0.58017385 -1.1333708 ]
[-0.59656024 -0.5636894 ]
[-0.56163955 -1.2321153 ]
[-0.690972  -0.8925452]
[-0.7885031 -0.7024864]
[-0.5536846 -0.6583319]
[-0.727553  -0.7092635]
[-0.5544032 -0.8956994]
[-1.8938342 -1.0060309]
[-0.9235957 -1.2359898]
[-0.58892035 -1.1839755 ]
[-0.8933141 -0.7482045]
[-2.883668   -0.66225207]
[-0.745499  -0.8579817]
[-0.68677086 -0.6182796 ]
[-0.741604   -0.60437477]
[-0.663054

In [150]:
class Rollout:
    def __init__(self):
        self.actions = []
        self.states = []
        self.next_states = []
        self.rewards = []
        self.dones = []
        self.log_probs = []

    def save_rollout_data(self, state, action, next_state, reward, done, log_prob):
        self.states.append(state)
        self.dones.append(done)
        self.next_states.append(next_state)
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.rewards.append(reward)

    def stack():
        pass

    def sample(self):
        pass
    

    

In [167]:
class MasterAgent():

    def __init__(self, state_size, action_size, seed):
        self.network = ActorAndCritic(state_size, action_size, seed)
        self.avg_score = 0
        self.score_list = deque(maxlen=100)
        
    def rollout_init(self):
        self.rollout = Rollout()
    
    def optimize(self):
        pass

    def step(self, state, action, next_state, reward, done, log_prob):
        state = torch.Tensor(state, device = device).float()
        next_state = torch.Tensor(next_state, device = device).float()
        done = torch.Tensor(np.asarray([int(done)]), device = device)
        log_prob = torch.Tensor(log_prob,device = device).float()
        print(reward)
        reward = torch.from_numpy(np.array(reward)).float().to(device)
        self.rollout.save_rollout_data(state, action, next_state, reward, done, log_prob)

    def train(self, iteration = 300):
        self.state = env.reset()
        for i in range(iteration):
            score = self.run(env)
            self.score_list.append(score)
            self.optimize()
            

    def run(self, env):
        self.rollout_init()
        score = 0
        for _ in range(ROLLOUT_LENGTH):
            action, distribution = get_actions_from_states(network, self.state)
            log_prob = get_log_probability_from_actions(distribution, action)
            next_state, reward, done, _ = env.step(action)
            score += reward
            self.step(self.state, action, next_state, reward, done, log_prob)
            self.state = next_state
            if done:
                print("done ::")
                state = env.reset()
        print("Terminating Rollout")
        return score


In [168]:
session = MasterAgent(state_size, action_size, 2)

In [169]:
session.train(1)

-1.6822729861988819
-1.7769707745241021
1.5995700275338731
0.8455030373247439
-0.21232941387847176
-0.5229431662789977
2.069900260704375
0.23091047849355278
2.7883758734073028
-0.05187137608511649
2.8235027222528517
-1.1572961784098141
1.0864126988149962
-1.5347723274809937
-2.3274323032559026
-1.695044264568736
1.210190002278441
-1.1246487410591204
2.9524988279789524
-1.5124328598566978
2.0065915482176493
-0.13214134473037575
0.9851526697788494
-1.9431199683852856
-1.2966554019340435
-1.4433808178748109
-1.1167082256080665
-1.2339868258239335
0.6700127434333286
-1.3320208947115577
1.1334867155859
-1.4918790027380737
0.9547303723916127
2.7816275368304106
1.0105501386580045
1.316530174100086
-1.4345578643577142
-1.3886496004588764
-0.17606720438456702
0.5680505899652758
-1.4245620469851588
-1.3477102709893949
1.3528141814781236
-1.4388081787896272
2.3554146139291405
-0.15304242332038484
2.4228806311628275
-1.4009885041630241
2.0283548002000544
-1.3932165699621624
-1.5744022745604855
-1.

In [171]:
session.rollout.dones

[tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0.]),
 tensor([0

In [140]:
torch.Tensor(np.asarray([int(done)]))

tensor([1.])