<a href="https://colab.research.google.com/github/akterskii/RL/blob/master/Dead%20prediction/TD3PG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gym['box2d']

Collecting box2d-py~=2.3.5; extra == "box2d"
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K     |████████████████████████████████| 450kB 9.1MB/s 
Installing collected packages: box2d-py
Successfully installed box2d-py-2.3.8


In [2]:
# Run this cell to mount your Google Drive.
from google.colab import drive
mount = '/content/drive'
# Mount at the path held in `mount` rather than repeating the literal.
drive.mount(mount)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Import

In [0]:
import os 
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#for RAdam
from torch.optim.optimizer import Optimizer, required

# for custom env
from gym.wrappers import TimeLimit

## RAdam optimizer

In [0]:
class RAdam(Optimizer):
    """RAdam optimizer: Adam-style moment estimates with a rectified step size.

    For each parameter the optimizer keeps exponential moving averages of the
    gradient (``exp_avg``) and of its square (``exp_avg_sq``). A quantity
    ``N_sma`` is derived from ``beta2`` and the step count:

    * ``N_sma > 5``  -> adaptive update ``exp_avg / (sqrt(exp_avg_sq) + eps)``
      scaled by a rectification factor;
    * otherwise      -> plain bias-corrected momentum update with ``exp_avg``.

    Args:
        params: iterable of parameters or param-group dicts.
        lr: learning rate.
        betas: ``(beta1, beta2)`` decay rates of the two moving averages.
        eps: constant added to the denominator of the adaptive update.
        weight_decay: multiplicative decay applied to the weights (scaled by lr).
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # Ring of 10 cache slots, each ``[key, N_sma, rectification]``.
        # The key is ``(step, beta1, beta2)`` and the cached factor does NOT
        # include the learning rate, so param groups with different lr/betas
        # can share the cache without picking up each other's step size.
        self.buffer = [[None, None, None] for _ in range(10)]
        super(RAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RAdam, self).__setstate__(state)

    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if none was given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                # All arithmetic is done on an fp32 copy and written back at the end.
                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                # Keyword forms replace the deprecated positional-scalar overloads.
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                state['step'] += 1
                step = state['step']

                cache_key = (step, beta1, beta2)
                buffered = self.buffer[int(step % 10)]
                if buffered[0] == cache_key:
                    N_sma, rectification = buffered[1], buffered[2]
                else:
                    beta2_t = beta2 ** step
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * step * beta2_t / (1 - beta2_t)
                    if N_sma > 5:
                        rectification = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2))
                    else:
                        rectification = 1.0
                    buffered[0], buffered[1], buffered[2] = cache_key, N_sma, rectification

                # The learning rate is applied here, per group, never cached.
                step_size = group['lr'] * rectification / (1 - beta1 ** step)

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])

                if N_sma > 5:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
                else:
                    p_data_fp32.add_(exp_avg, alpha=-step_size)

                p.data.copy_(p_data_fp32)

        return loss

## Actor and Critic

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# `device` is read as a module-level global by the TD3 agent defined below.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

class Actor(nn.Module):
    """Policy network mapping a state batch to bounded actions.

    Three fully connected layers (state_dim -> 400 -> 300 -> action_dim) with
    ReLU activations; the output is squashed by tanh and scaled by
    ``max_action``.

    Args:
        state_dim: size of the state vector.
        action_dim: size of the action vector.
        max_action: scale applied to the tanh output.
        name: label used to build the checkpoint file name ``<name>.pth``.
    """

    def __init__(self, state_dim, action_dim, max_action, name):
        super().__init__()

        self.max_action = max_action
        self.fname = name + '.pth'

        # Layers are created in l1, l2, l3 order so seeded initialisation
        # is reproducible.
        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)

    def forward(self, state):
        """Return the action for ``state``, each component scaled by ``max_action``."""
        hidden = state
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        return torch.tanh(self.l3(hidden)) * self.max_action
    
    
        
class Critic(nn.Module):
    """Network scoring a (state, action) pair with a single scalar.

    State and action are concatenated along dim 1 and passed through
    (state_dim + action_dim) -> 400 -> 300 -> 1 with ReLU activations.

    Args:
        state_dim: size of the state vector.
        action_dim: size of the action vector.
        name: label stored on the module; also builds ``<name>.pth``.
        probability: when True the output goes through a sigmoid, so it lies
            in (0, 1); when False the raw linear output is returned.
    """

    def __init__(self, state_dim, action_dim, name, probability=False):
        super().__init__()

        self.name = name
        self.fname = name + '.pth'
        self.probability = probability

        self.l1 = nn.Linear(state_dim + action_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, 1)

    def forward(self, state, action):
        """Return one value per row of the (state, action) batch."""
        features = torch.cat([state, action], 1)
        features = F.relu(self.l1(features))
        features = F.relu(self.l2(features))
        value = self.l3(features)
        return torch.sigmoid(value) if self.probability else value
    

cuda:0


## Agent

In [0]:
class TD3:
    """TD3-style agent with an optional "danger" actor/critic pair.

    The core agent holds an actor, two critics and target copies of all three,
    trained in ``update`` with clipped target-policy noise, the minimum of the
    two target critics, and delayed actor/target updates.

    The danger pair is trained on the ``done`` flags of stored transitions:
    ``critic_danger`` (sigmoid output) is regressed onto ``done`` and
    ``actor_danger`` is trained to minimise that prediction. In
    ``select_action`` the danger actor overrides the regular action whenever
    the predicted danger exceeds ``danger_threshold``.

    Args:
        lr: learning rate shared by all optimizers.
        state_dim: size of the state vector.
        action_dim: size of the action vector.
        max_action: bound used by the actors and for clamping target actions.
        danger: whether ``update`` trains the danger actor/critic.
        danger_threshold: danger probability above which the danger actor acts.
        directory: default checkpoint directory for save/load.
        fname: default checkpoint file-name prefix for save/load.
        epochs_for_danger: passes over the danger buffer per ``update`` call.
    """

    # Optimizer used for every network; False switches to torch.optim.Adam.
    USE_RADAM = True

    # Attribute names in the order checkpoints are written and read.
    _CORE_MODULES = ('actor', 'actor_target',
                     'critic_1', 'critic_1_target',
                     'critic_2', 'critic_2_target')
    _DANGER_MODULES = ('actor_danger', 'critic_danger')
    _CORE_OPTIMIZERS = ('actor_optimizer', 'critic_1_optimizer', 'critic_2_optimizer')
    _DANGER_OPTIMIZERS = ('actor_danger_optimizer', 'critic_danger_optimizer')

    def __init__(self, lr, state_dim, action_dim, max_action, danger, danger_threshold, directory, fname, epochs_for_danger):
        # Networks are created in the same order as before so seeded
        # initialisation stays reproducible.
        self.actor = Actor(state_dim, action_dim, max_action, 'actor').to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action, 'actor_target').to(device)
        # Scratch copy of the actor used for parameter-noise exploration.
        self.actor_perturbed = Actor(state_dim, action_dim, max_action, 'actor_perturbed').to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = self._make_optimizer(self.actor, lr)

        self.critic_1 = Critic(state_dim, action_dim, 'critic_1').to(device)
        self.critic_1_target = Critic(state_dim, action_dim, 'critic_1_target').to(device)
        self.critic_1_target.load_state_dict(self.critic_1.state_dict())
        self.critic_1_optimizer = self._make_optimizer(self.critic_1, lr)

        self.critic_2 = Critic(state_dim, action_dim, 'critic_2').to(device)
        self.critic_2_target = Critic(state_dim, action_dim, 'critic_2_target').to(device)
        self.critic_2_target.load_state_dict(self.critic_2.state_dict())
        self.critic_2_optimizer = self._make_optimizer(self.critic_2, lr)

        self.max_action = max_action

        # danger
        self.danger = danger
        self.actor_danger = Actor(state_dim, action_dim, max_action, 'actor_danger').to(device)
        self.actor_danger_optimizer = self._make_optimizer(self.actor_danger, lr)

        self.critic_danger = Critic(state_dim, action_dim, 'critic_danger', probability=True).to(device)
        self.critic_danger_optimizer = self._make_optimizer(self.critic_danger, lr)

        self.threshold = danger_threshold
        self.directory = directory
        self.fname = fname
        self.epochs_for_danger = epochs_for_danger
        # True when the last select_action(danger=True) call used the danger actor.
        self.action_update = False

    @classmethod
    def _make_optimizer(cls, module, lr):
        """Build the optimizer for ``module`` (RAdam unless USE_RADAM is False)."""
        if cls.USE_RADAM:
            return RAdam(module.parameters(), lr=lr)
        return optim.Adam(module.parameters(), lr=lr)

    @staticmethod
    def _soft_update(source, target, polyak):
        """Polyak-average ``source`` into ``target``: t <- polyak*t + (1-polyak)*s."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_((polyak * target_param.data) + ((1 - polyak) * param.data))

    def _base_path(self, directory, fname):
        """Return the ``<directory>/<fname>_`` checkpoint prefix, using stored defaults for None."""
        if directory is None:
            directory = self.directory
        if fname is None:
            fname = self.fname
        return "%s/%s_" % (directory, fname)

    def select_action(self, state, danger=False, param_noise=False, debug=False):
        """Return the action for a single state as a flat numpy array.

        Args:
            state: numpy array; it is reshaped to one row.
            danger: if True, query ``critic_danger`` and use ``actor_danger``
                when the predicted danger exceeds ``self.threshold``.
                ``self.action_update`` records whether that override happened.
            param_noise: if True, act with ``actor_perturbed`` instead of ``actor``.
            debug: if True, print the danger estimate before and after an override.
        """
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        policy = self.actor_perturbed if param_noise else self.actor

        # Pure inference: no autograd graph is needed for any call below.
        with torch.no_grad():
            policy.eval()
            action = policy(state)
            policy.train()

            if danger:
                self.action_update = False

                # estimate probability of death
                init_prob_danger = self.critic_danger(state, action).cpu().data.numpy().flatten()
                if init_prob_danger > self.threshold:

                    # safe action in danger case
                    self.actor_danger.eval()  # set evaluation mode
                    action = self.actor_danger(state)
                    self.actor_danger.train()  # set back to train mode

                    self.action_update = True

                    if debug:
                        new_prob_danger = self.critic_danger(state, action).cpu().data.numpy().flatten()
                        print( "\t\t\tP before: {}. P after: {}".format(init_prob_danger, new_prob_danger))

        return action.cpu().data.numpy().flatten()

    def perturb_actor_parameters(self, param_noise):
        """Apply parameter noise to actor model, for exploration.

        ``actor_perturbed`` is reset to the current actor weights and Gaussian
        noise with std ``param_noise.current_stddev`` is added to each
        state-dict entry. Entries whose name contains 'ln' are skipped.
        """
        hard_update(self.actor_perturbed, self.actor)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                # Was `pass`, which made this filter a no-op.
                continue
            param = params[name]
            noise = torch.randn(param.shape).to(device)

            # In-place add: state-dict tensors share storage with the module.
            param += noise * param_noise.current_stddev

    def update(self, replay_buffer, replay_buffer_danger, n_iter, batch_size, batch_size_danger, gamma, polyak, policy_noise, noise_clip, policy_delay):
        """Run ``n_iter`` TD3 updates, then (optionally) train the danger pair.

        Args:
            replay_buffer: buffer of regular transitions; must provide ``sample``.
            replay_buffer_danger: buffer of danger transitions; its ``buffer``,
                ``size`` and ``sample`` are used.
            n_iter: number of critic updates on ``replay_buffer``.
            batch_size: batch size for the TD3 updates.
            batch_size_danger: batch size for the danger updates.
            gamma: discount factor.
            polyak: weight kept from the old target parameters.
            policy_noise: std of the noise added to target actions.
            noise_clip: clamp applied to that noise.
            policy_delay: the actors and targets are updated every
                ``policy_delay``-th iteration.
        """
        for i in range(n_iter):
            # Sample a batch of transitions from replay buffer:
            state, action_, reward, next_state, done = replay_buffer.sample(batch_size)
            state = torch.FloatTensor(state).to(device)
            action = torch.FloatTensor(action_).to(device)
            reward = torch.FloatTensor(reward).reshape((batch_size, 1)).to(device)
            next_state = torch.FloatTensor(next_state).to(device)
            done = torch.FloatTensor(done).reshape((batch_size, 1)).to(device)

            # The bootstrap target is a constant w.r.t. the critics' losses.
            with torch.no_grad():
                # Select next action according to target policy:
                noise = torch.FloatTensor(action_).data.normal_(0, policy_noise).to(device)
                noise = noise.clamp(-noise_clip, noise_clip)
                next_action = (self.actor_target(next_state) + noise)
                next_action = next_action.clamp(-self.max_action, self.max_action)

                # Compute target Q-value:
                target_Q1 = self.critic_1_target(next_state, next_action)
                target_Q2 = self.critic_2_target(next_state, next_action)
                target_Q = torch.min(target_Q1, target_Q2)
                target_Q = reward + ((1 - done) * gamma * target_Q)

            # Optimize Critic 1:
            current_Q1 = self.critic_1(state, action)
            loss_Q1 = F.mse_loss(current_Q1, target_Q)
            self.critic_1_optimizer.zero_grad()
            loss_Q1.backward()
            self.critic_1_optimizer.step()

            # Optimize Critic 2:
            current_Q2 = self.critic_2(state, action)
            loss_Q2 = F.mse_loss(current_Q2, target_Q)
            self.critic_2_optimizer.zero_grad()
            loss_Q2.backward()
            self.critic_2_optimizer.step()

            # Delayed policy updates:
            if i % policy_delay == 0:
                # Compute actor loss:
                actor_loss = -self.critic_1(state, self.actor(state)).mean()

                # Optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Polyak averaging update:
                self._soft_update(self.actor, self.actor_target, polyak)
                self._soft_update(self.critic_1, self.critic_1_target, polyak)
                self._soft_update(self.critic_2, self.critic_2_target, polyak)

        if self.danger and len(replay_buffer_danger.buffer) > 0:
            batch_steps = max(1, replay_buffer_danger.size // batch_size_danger)
            for _epoch in range(self.epochs_for_danger):
                for j in range(batch_steps):
                    # Sample two batches of transitions: deadend and normals
                    state_not_danger, action_not_danger, _, _, done_not_danger = replay_buffer.sample(batch_size_danger)
                    state_not_danger = torch.FloatTensor(state_not_danger).to(device)
                    action_not_danger = torch.FloatTensor(action_not_danger).to(device)
                    done_not_danger = torch.FloatTensor(done_not_danger).reshape((batch_size_danger, 1)).to(device)

                    state_danger, action_danger, _, _, done_danger = replay_buffer_danger.sample(batch_size_danger)
                    state_danger = torch.FloatTensor(state_danger).to(device)
                    action_danger = torch.FloatTensor(action_danger).to(device)
                    done_danger = torch.FloatTensor(done_danger).reshape((batch_size_danger, 1)).to(device)

                    # The done flags are the regression targets of the danger critic.
                    target_Q_not_danger = done_not_danger
                    target_Q_danger = done_danger

                    # Optimize Critic Danger:
                    current_Q_danger = self.critic_danger(state_danger, action_danger)
                    current_Q_not_danger = self.critic_danger(state_not_danger, action_not_danger)
                    loss_Q_danger = F.mse_loss(current_Q_danger, target_Q_danger)
                    loss_Q_not_danger = F.mse_loss(current_Q_not_danger, target_Q_not_danger)
                    loss_QD = (loss_Q_danger + loss_Q_not_danger) / 2
                    self.critic_danger_optimizer.zero_grad()
                    loss_QD.backward()
                    self.critic_danger_optimizer.step()

                    if j % policy_delay == 0:
                        # The danger actor minimises the predicted danger of its own action.
                        actor_danger_loss = self.critic_danger(state_danger, self.actor_danger(state_danger)).mean()

                        # Optimize the actor for danger
                        self.actor_danger_optimizer.zero_grad()
                        actor_danger_loss.backward()
                        self.actor_danger_optimizer.step()

    def save(self, directory=None, fname=None, optimizers=False, danger=False):
        """Write network (and optionally optimizer) state dicts to disk.

        Files are named ``<directory>/<fname>_<module>.pth`` and
        ``<directory>/<fname>_<module>_optimizer.pth``.

        Args:
            directory: target directory; defaults to ``self.directory``.
            fname: file-name prefix; defaults to ``self.fname``.
            optimizers: also save the optimizer state dicts.
            danger: also save the danger actor/critic (and their optimizers
                when ``optimizers`` is True).
        """
        base_path = self._base_path(directory, fname)

        module_names = self._CORE_MODULES + (self._DANGER_MODULES if danger else ())
        for attr in module_names:
            module = getattr(self, attr)
            torch.save(module.state_dict(), base_path + module.fname)

        if optimizers:
            optimizer_names = self._CORE_OPTIMIZERS + (self._DANGER_OPTIMIZERS if danger else ())
            for attr in optimizer_names:
                torch.save(getattr(self, attr).state_dict(), base_path + attr + '.pth')

    def load(self, directory=None, fname=None, optimizers=False, danger=False):
        """Restore state written by ``save`` with the same arguments.

        Tensors are first mapped to CPU storage; ``load_state_dict`` then
        copies them into the existing modules/optimizers.

        Args:
            directory: source directory; defaults to ``self.directory``.
            fname: file-name prefix; defaults to ``self.fname``.
            optimizers: also restore the optimizer state dicts.
            danger: also restore the danger actor/critic (and their
                optimizers when ``optimizers`` is True).
        """
        base_path = self._base_path(directory, fname)

        def read(path):
            return torch.load(path, map_location=lambda storage, loc: storage)

        module_names = self._CORE_MODULES + (self._DANGER_MODULES if danger else ())
        for attr in module_names:
            module = getattr(self, attr)
            module.load_state_dict(read(base_path + module.fname))

        if optimizers:
            # Each optimizer is read from its own *_optimizer.pth file, the
            # same name that save() writes.
            optimizer_names = self._CORE_OPTIMIZERS + (self._DANGER_OPTIMIZERS if danger else ())
            for attr in optimizer_names:
                getattr(self, attr).load_state_dict(read(base_path + attr + '.pth'))

    def load_actor(self, directory=None, fname=None, danger=False):
        """Restore only the actor and its target (plus the danger pair if ``danger``).

        Args:
            directory: source directory; defaults to ``self.directory``.
            fname: file-name prefix; defaults to ``self.fname``.
            danger: also restore ``actor_danger`` and ``critic_danger``.
        """
        base_path = self._base_path(directory, fname)

        module_names = ('actor', 'actor_target') + (self._DANGER_MODULES if danger else ())
        for attr in module_names:
            module = getattr(self, attr)
            module.load_state_dict(torch.load(base_path + module.fname, map_location=lambda storage, loc: storage))
        
        
        
      

## Custom Bipedal

In [0]:
import numpy as np
import warnings
from gym.envs.box2d.bipedal_walker import *
class CustomizableBipedalWalker(BipedalWalker):
    """BipedalWalker whose obstacle terrain is controlled by ``self.params``.

    ``_generate_terrain`` is overridden so that obstacle sizes (stumps, pits,
    stairs) and the set of obstacle types that may appear are read from the
    ``params`` dict instead of fixed constants. The public setters below
    update ``params`` and reset the environment so the change takes effect.
    """
    def __init__(self):
        # Default value of every tunable terrain parameter.
        # ``*_low`` / ``*_high`` pairs are passed to ``np_random.randint``.
        # 'states' lists the terrain state ids that may follow a grass
        # segment (0 is GRASS in _generate_terrain); 'state_probs' holds their
        # sampling probabilities and is passed as ``p`` to ``np_random.choice``.
        self.default_params = {
            'stump_height_low': 1,
            'stump_height_high': 3,
            'pit_depth': 4,
            'pit_width_low': 3,
            'pit_width_high': 5,
            'stair_heights': [-.5, .5],
            'stair_width_low': 4,
            'stair_width_high': 5,
            'stair_steps_low': 3,
            'stair_steps_high': 5,
            'states': [0],
            'state_probs': None
        }
        # Shallow copy, so updating ``params`` never rebinds the defaults' keys.
        self.params = {**self.default_params}
        # NOTE(review): params are assigned before the base initializer,
        # presumably because it already generates terrain - confirm against gym.
        BipedalWalker.__init__(self)
        
    def _update_env_params(self, **kwargs):
        """Merge ``kwargs`` into ``self.params`` and reset the environment."""
        # TODO: add kind of sanity check here
        self.params = {**self.params, **kwargs}
        _ = self.reset()
        
    def reset_env_params(self, hardcore=False):
        """Restore the default parameters and reset the environment.

        :param hardcore: if True, all four terrain states (ids 0..3) are
            enabled instead of the default ``[0]``.
        """
        params = {**self.default_params}
        if hardcore:
            params['states'] = np.arange(4)
        self._update_env_params(**params)
    
    def set_env_states(self, state_mask, p=None):
        """
        Select which terrain states may be generated and reset the environment.

        :param state_mask: np.array(,dtype=bool) that masks ["GRASS", "STUMP", "STAIRS", "PIT"].
            Note that masking out "GRASS" takes no effect.
        :param p: np.array or list of probabilities: [p_grass, p_stump, p_stairs, p_pit].
            Probs corresponding to masked out states are ignored
        :return: None
        :raises ValueError: if any entry of ``p`` is negative.
        """
        states_ = np.arange(4)[state_mask]
        p_ = None
        if p is not None:
            p_ = np.array(p)
            if not np.all(p_ >= 0):
                raise ValueError
            # Renormalise over the states that survive the mask.
            p_ = p_[state_mask] / p_[state_mask].sum()
        self._update_env_params(states=states_, state_probs=p_)
    
    def set_env_params(self, pit_width=None, stair_width=None, stair_steps=None, stump_height=None):
        """
            Set the sampling range of obstacle sizes and reset the environment.

            NB: All params are integers or tuples of integers

            An int ``v`` becomes the range ``[v, v + 1)``; a tuple/list
            ``(low, high)`` is used as is when ``high - low >= 1``, otherwise a
            warning is issued and ``[low, low + 1)`` is used. Arguments left
            as None are not changed.
        """
        # Snapshot of the call arguments; the key names double as the
        # prefixes of the ``*_low`` / ``*_high`` entries in ``self.params``.
        kwargs = {**locals()}
        _ = kwargs.pop('self', None)
        params = {}
        for k,v in kwargs.items():
            if type(v) is int:
                params[k + '_low'] = v
                params[k + '_high'] = v + 1
            elif isinstance(v, (tuple, list)): 
                if v[1] - v[0] >= 1:
                    params[k + '_low'] = v[0]
                    params[k + '_high'] = v[1]
                else:
                    warnings.warn(f'{k} shoud be an integer. {k}[1] - {k}[0] < 1 '+\
                                  f'=> will set {k}_low = {v[0]}, {k}_high = {v[0]+1}')
                    params[k + '_low'] = v[0]
                    params[k + '_high'] = v[0] + 1
        self._update_env_params(**params)
        
    def _generate_terrain(self, hardcore=True):
        """Build the terrain bodies and height profile from ``self.params``.

        The terrain is produced by a small state machine over GRASS, STUMP,
        STAIRS and PIT. ``counter`` holds the number of remaining steps in the
        current state; ``oneshot`` is True only on the first step of a state,
        which is where obstacle bodies are created. Fills ``self.terrain``,
        ``self.terrain_x``, ``self.terrain_y`` and ``self.terrain_poly``.

        :param hardcore: not read by this implementation; obstacle selection
            comes from ``self.params['states']`` instead.
        """
        GRASS, STUMP, STAIRS, PIT, _STATES_ = range(5)
        state    = GRASS
        velocity = 0.0
        y        = TERRAIN_HEIGHT
        counter  = TERRAIN_STARTPAD
        oneshot  = False
        self.terrain   = []
        self.terrain_x = []
        self.terrain_y = []
        for i in range(TERRAIN_LENGTH):
            x = i*TERRAIN_STEP
            self.terrain_x.append(x)

            if state==GRASS and not oneshot:
                # Height drifts back towards TERRAIN_HEIGHT, with random
                # perturbation once past the start pad.
                velocity = 0.8*velocity + 0.01*np.sign(TERRAIN_HEIGHT - y)
                if i > TERRAIN_STARTPAD: velocity += self.np_random.uniform(-1, 1)/SCALE   #1
                y += velocity

            elif state==PIT and oneshot:
                # First step of a pit: draw its width and create the two
                # wall bodies, the second shifted right by ``counter`` steps.
                counter = self.np_random.randint(self.params['pit_width_low'], 
                                                 self.params['pit_width_high'])
                PIT_H = self.params['pit_depth']
                poly = [
                    (x,              y),
                    (x+TERRAIN_STEP, y),
                    (x+TERRAIN_STEP, y-PIT_H*TERRAIN_STEP),
                    (x,              y-PIT_H*TERRAIN_STEP),
                    ]
                self.fd_polygon.shape.vertices=poly
                t = self.world.CreateStaticBody(
                    fixtures = self.fd_polygon)
                t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6)
                self.terrain.append(t)

                self.fd_polygon.shape.vertices=[(p[0]+TERRAIN_STEP*counter,p[1]) for p in poly]
                t = self.world.CreateStaticBody(
                    fixtures = self.fd_polygon)
                t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6)
                self.terrain.append(t)
                counter += 2
                original_y = y

            elif state==PIT and not oneshot:
                # Inside the pit the profile sits PIT_H steps below the rim;
                # it returns to the rim height on the last step.
                y = original_y
                if counter > 1:
                    y -= PIT_H*TERRAIN_STEP

            elif state==STUMP and oneshot:
                # A stump is a square whose side (in terrain steps) is drawn
                # from the stump_height range; ``counter`` doubles as its size.
                counter = self.np_random.randint(self.params['stump_height_low'], self.params['stump_height_high'])
                poly = [
                    (x,                      y),
                    (x+counter*TERRAIN_STEP, y),
                    (x+counter*TERRAIN_STEP, y+counter*TERRAIN_STEP),
                    (x,                      y+counter*TERRAIN_STEP),
                    ]
                self.fd_polygon.shape.vertices=poly
                t = self.world.CreateStaticBody(
                    fixtures = self.fd_polygon)
                t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6)
                self.terrain.append(t)

            elif state==STAIRS and oneshot:
                # Draw the step height, width and count, then create one
                # body per step.
                stair_height = self.np_random.choice(self.params['stair_heights'])
                stair_width = self.np_random.randint(self.params['stair_width_low'], 
                                                     self.params['stair_width_high'])
                stair_steps = self.np_random.randint(self.params['stair_steps_low'], 
                                                     self.params['stair_steps_high'])
                original_y = y
                for s in range(stair_steps):
                    poly = [
                        (x+(    s*stair_width)*TERRAIN_STEP, y+(   s*stair_height)*TERRAIN_STEP),
                        (x+((1+s)*stair_width)*TERRAIN_STEP, y+(   s*stair_height)*TERRAIN_STEP),
                        (x+((1+s)*stair_width)*TERRAIN_STEP, y+(-1+s*stair_height)*TERRAIN_STEP),
                        (x+(    s*stair_width)*TERRAIN_STEP, y+(-1+s*stair_height)*TERRAIN_STEP),
                        ]
                    self.fd_polygon.shape.vertices=poly
                    t = self.world.CreateStaticBody(
                        fixtures = self.fd_polygon)
                    t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6)
                    self.terrain.append(t)
                counter = stair_steps*stair_width

            elif state==STAIRS and not oneshot:
                # Height profile along the stairs, derived from how far the
                # counter has advanced through the staircase.
                s = stair_steps*stair_width - counter - stair_height
                n = s/stair_width
                y = original_y + (n*stair_height)*TERRAIN_STEP

            oneshot = False
            self.terrain_y.append(y)
            counter -= 1
            if counter==0:
                # Current segment finished: draw the length of the next one.
                # After grass comes a state sampled from params['states'];
                # after any obstacle comes grass.
                counter = self.np_random.randint(TERRAIN_GRASS/2, TERRAIN_GRASS)
                if state==GRASS:
                    state = self.np_random.choice(self.params['states'], p=self.params['state_probs'])
                    oneshot = True
                else:
                    state = GRASS
                    oneshot = True

        # Turn the height profile into edge bodies and filled polygons,
        # one per pair of consecutive terrain points.
        self.terrain_poly = []
        for i in range(TERRAIN_LENGTH-1):
            poly = [
                (self.terrain_x[i],   self.terrain_y[i]),
                (self.terrain_x[i+1], self.terrain_y[i+1])
                ]
            self.fd_edge.shape.vertices=poly
            t = self.world.CreateStaticBody(
                fixtures = self.fd_edge)
            color = (0.3, 1.0 if i%2==0 else 0.8, 0.3)
            t.color1 = color
            t.color2 = color
            self.terrain.append(t)
            color = (0.4, 0.6, 0.3)
            # Close the polygon down to y = 0 below the edge.
            poly += [ (poly[1][0], 0), (poly[0][0], 0) ]
            self.terrain_poly.append( (poly, color) )
        self.terrain.reverse()

## Noise

In [0]:
class AdaptiveParamNoiseSpec(object):
    """Holds and adapts the standard deviation of parameter-space noise.

    ``initial_stddev`` and ``current_stddev`` are the std of the noise added
    to network parameters; ``desired_action_stddev`` is the target std
    measured in action space. ``adaptation_coefficient`` is the factor by
    which ``current_stddev`` is multiplied or divided on each ``adapt`` call.
    """

    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2, adaptation_coefficient=1.01):
        self.initial_stddev = initial_stddev
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient

    def adapt(self, sq_distance):
        """Adjust ``current_stddev`` from a measured squared action distance.

        Expects ``sq_distance`` to be the SQUARED distance between perturbed
        and unperturbed actions. If it exceeds ``desired_action_stddev ** 2``
        the noise is shrunk, otherwise it is grown.
        """
        threshold = self.desired_action_stddev ** 2
        if not sq_distance > threshold:
            # At or below target: increase stddev.
            self.current_stddev = self.current_stddev * self.adaptation_coefficient
            return
        # Above target: decrease stddev.
        self.current_stddev = self.current_stddev / self.adaptation_coefficient

    def get_stats(self):
        """Return the current noise std in a dict keyed 'param_noise_stddev'."""
        return {'param_noise_stddev': self.current_stddev}

    def __repr__(self):
        return (
            f'AdaptiveParamNoiseSpec(initial_stddev={self.initial_stddev}, '
            f'desired_action_stddev={self.desired_action_stddev}, '
            f'adaptation_coefficient={self.adaptation_coefficient})'
        )

def ddpg_sq_distance_metric(actions1, actions2):
    """
    Return the SQUARED "distance" between two policies' actions at the same states.

    The squared differences are averaged over axis 0 first, and the
    resulting per-column means are then averaged into one scalar.
    Expects numpy arrays of equal shape.
    """
    per_dim_mse = np.mean(np.square(actions1 - actions2), axis=0)
    return np.mean(per_dim_mse)


def hard_update(target, source):
    """Copy each parameter of ``source`` into the matching parameter of ``target``, in place.

    Parameters are paired positionally via ``parameters()``.
    """
    param_pairs = zip(target.parameters(), source.parameters())
    for dst, src in param_pairs:
        dst.data.copy_(src.data)
            
            
class Noise:
    """Unfinished placeholder for an action-noise generator.

    Only the dimensionality is stored. The 'normal' and 'ou' variants hinted
    at by the original comments are not implemented.
    """

    def __init__(self, dim):
        # No generator state is set up yet for either 'normal' or 'ou' noise.
        self.dim = dim

    def get_noise(self, n_type='normal'):
        """Return None for every ``n_type``; noise generation is still a TODO."""
        return None
    

## Replay buffer

In [0]:
import numpy as np

class ReplayBuffer:
    """
    List-backed store of `(state, action, reward, next_state, done)` transitions.

    The buffer is allowed to grow past `max_size`; the oldest fifth is dropped
    lazily, the next time `sample` is called. The content can be pickled to
    `<base_path>/<name>.pth`.
    """

    def __init__(self, max_size=1e6, base_path=None, name=None):
        """
        max_size  -- number of stored transitions after which trimming kicks in
        base_path -- directory used by save()/load(); '' (current dir) if None
        name      -- file stem used by save()/load(); 'buffer' if None
        """
        self.buffer = []
        self.max_size = int(max_size)
        self.size = 0
        self.base_path = base_path if base_path is not None else ''
        self.fname = (name if name is not None else 'buffer') + '.pth'

    def _file_path(self):
        """Full path of the pickle file used by save() and load()."""
        import os
        return os.path.join(self.base_path, self.fname)

    @staticmethod
    def _stack(transitions):
        """Turn an iterable of 5-tuples into five numpy arrays (one per field)."""
        state, action, reward, next_state, done = [], [], [], [], []
        for s, a, r, s_, d in transitions:
            # np.asarray avoids a copy when possible and, unlike
            # np.array(..., copy=False), still works on NumPy >= 2 when a copy
            # is unavoidable (e.g. for plain Python floats).
            state.append(np.asarray(s))
            action.append(np.asarray(a))
            reward.append(np.asarray(r))
            next_state.append(np.asarray(s_))
            done.append(np.asarray(d))
        return np.array(state), np.array(action), np.array(reward), np.array(next_state), np.array(done)

    def add(self, transition):
        """Append one transition: a tuple of (state, action, reward, next_state, done)."""
        self.size += 1
        self.buffer.append(transition)

    def save(self):
        """Pickle the stored transitions; an OSError is reported, not raised."""
        import pickle
        try:
            with open(self._file_path(), 'wb') as f:
                pickle.dump(self.buffer, f)
        except OSError:
            print('Buffer is not saved!\n\n')

    def load(self):
        """
        Replace the content with the pickled transitions from disk.

        On OSError the buffer is emptied and a message is printed. `size` is
        re-synchronised with the loaded content in both cases.
        """
        import pickle
        try:
            with open(self._file_path(), 'rb') as f:
                # NOTE(security): unpickling can execute arbitrary code; only
                # load buffer files that this notebook wrote itself.
                self.buffer = pickle.load(f)
        except OSError:
            self.buffer = []
            print('Buffer is not loaded!\n\n')
        self.size = len(self.buffer)

    def sample(self, batch_size):
        """
        Draw `batch_size` transitions uniformly, with replacement.

        Returns five arrays: states, actions, rewards, next_states, dones.
        Sampling from an empty buffer raises ValueError (from np.random.randint).
        """
        # delete 1/5th of the buffer when full
        if self.size > self.max_size:
            del self.buffer[0:int(self.size / 5)]
            self.size = len(self.buffer)

        indexes = np.random.randint(0, len(self.buffer), size=batch_size)
        return self._stack(self.buffer[i] for i in indexes)

    def get_last(self, amount):
        """
        Return the `amount` most recent transitions, oldest first.

        The result has the same layout as `sample`: five arrays whose first
        axis indexes the transitions. If fewer than `amount` transitions are
        stored, all of them are returned; if nothing is selected the arrays
        are empty.
        """
        start = max(len(self.buffer) - amount, 0)
        return self._stack(self.buffer[start:])

## Trajectory buffer

In [0]:
from collections import deque
class TrajectoriesEndBuffer():
    """
    Sliding window over the most recent transitions of the running episode.

    When the episode ends, the window is copied into a replay buffer with the
    `done` field of every transition replaced by a heuristic probability that
    grows linearly towards the end of the window.
    """

    def __init__(self, max_trajectory_size=10, init_prob=0.7):
        """
        max_trajectory_size -- window length; must be a positive `int`
        init_prob           -- lowest probability of the linear ramp; must be a
                               `float` in [0, 1]
        """
        assert type(max_trajectory_size) is int
        assert max_trajectory_size > 0
        self.max_trajectory_size = max_trajectory_size
        self.buffer = deque()

        # Parameters of the linear probability ramp.
        assert type(init_prob) is float
        assert 0 <= init_prob <= 1
        self.min_prob = init_prob

    def add(self, transition):
        """Append a transition, evicting the oldest one once the window is full."""
        window_is_full = len(self.buffer) >= self.max_trajectory_size
        if window_is_full:
            self.buffer.popleft()
        self.buffer.append(transition)

    def get_heruistc_probabilities(self):
        """
        Return one probability per stored transition, oldest first.

        Values rise linearly from `min_prob` to 1 over a full window. A window
        that is not full yet is treated as the tail of a full one, so its last
        entry still gets probability 1.
        """
        stored = len(self.buffer)
        capacity = self.max_trajectory_size

        # A one-slot window has no ramp to interpolate over.
        if stored > 0 and capacity == 1:
            return [1]

        # Number of leading ramp positions skipped by a partially filled window.
        offset = max(capacity - stored, 0)
        floor = self.min_prob
        return [
            floor + (1 - floor) * ((position + offset) / (capacity - 1))
            for position in range(stored)
        ]

    def transfer_to_replay_buffer(self, replay_buffer):
        """
        Move the window into `replay_buffer` and clear it.

        Nothing is transferred when the window is empty or when the newest
        transition is not flagged as done; a message is printed instead.
        """
        if not self.buffer:
            print('Trajectories buffer is empty')
            return

        newest = self.buffer[-1]
        if newest[4] == False:
            print('Trajectory is not finished')
            return

        weights = self.get_heruistc_probabilities()
        for position, (s1, a, r, s2, _done) in enumerate(self.buffer):
            # The done flag is replaced by the heuristic probability.
            replay_buffer.add((s1, a, r, s2, float(weights[position])))

        self.buffer = deque()

## Research class

In [0]:
import random 
class Research:
    def set_danger_batch_size(self):
        self.batch_size_danger = self.batch_size // 2
        
    def update_params(self, params):
        #print(params)
        param_names = ['random_seed', 
                       'max_episodes', 'max_timesteps', 'batch_size', 
                       'lr', 'polyak', 'policy_delay', 'gamma', 'log_interval', 
                       'batch_size_danger', 
                       'policy_noise',
                       'danger_enable', 'danger_threshold', 'max_trajectory_size',
                       'directory','filename_load', 'filename_save', 'epochs_for_danger',
                       'noise_random_enable', 'exploration_noise', 'noise_clip',
                       'max_iter_danger', 'evaluate_after_episodes', 'evaluate_average_on_episodes', 'env_name_load', 
                       'env_name',
                       'param_noise_enable','initial_stddev','desired_action_stddev','adaptation_coefficient']
        
        for param_name in param_names:
            if param_name in params:
                
                setattr(self, param_name, params[param_name])
                print("{} set to {}".format(param_name, params[param_name]))
        
        
    def __init__(self, params):
        ###########################################
        ###         default values
        ###########################################
        
        self.random_seed = 1
        
        # environment
        self.env_rewards = {'BipedalWalkerHardcore-v2':[-100,300], 'BipedalWalker-v2':[-100,300],
                            'LunarLanderContinuous-v2':[-100,200],'Pendulum-v0':[-1000,200]}

        self.env_name_load = 'BipedalWalker-v2'
        self.env_name = 'BipedalWalker-v2'
        
        if self.env_name in self.env_rewards:
            self.terminal_reward = self.env_rewards[self.env_name][0]
            self.passed_reward   = self.env_rewards[self.env_name][1]
        else:
            self.terminal_reward = None
            self.passed_reward   = None
        
        self.env = None
        
        ## training 
        self.max_episodes = 1000         # max num of episodes
        self.max_timesteps = 2000        # max timesteps in one episode
        self.batch_size = 100            # num of transitions sampled from replay buffer
        self.lr = 0.001
        self.polyak = 0.995              # target policy update parameter (1-tau)
        self.policy_delay = 2            # delayed policy updates parameter
        
        ## model params
        self.gamma = 0.99                # discount for future rewards
        self.warmup_steps = 10**4
        
        ##############################   
        ##   Noise params
        ##############################   
        
        # Random noise
        self.noise_random_enable = True
        self.exploration_noise = 0.1
        self.policy_noise = 0.2          # target policy smoothing noise
        self.noise_clip = 0.5
        
        # Param noise
        self.param_noise_enable = False
        self.noise_distance_batch_size = 10 ** 4
        self.initial_stddev=0.05
        self.desired_action_stddev=0.45 
        self.adaptation_coefficient=1.03
        
        ## Danger params
        self.danger_enable = True               # enable dnager mode
        self.danger_threshold = 0.7      # probability to perform safe action
        self.epochs_for_danger = 3       # epochs to train danger
        self.max_iter_danger = 5         # iteration per iteration
        self.batch_size_danger = self.batch_size // 2
        self.max_trajectory_size = 10    # length of trajectory before death
        self.action_updates = 0
        
        ## evaluation 
        self.evaluate_average_on_episodes = 10
        self.evaluate_after_episodes = 30
        self.evaluate_after_timesteps = 10000
        
        ## InpOut params
        self.log_interval = 10                                              # print avg reward after interval
        self.directory = "/content/drive/My Drive"                          # save trained models
        self.filename_load = "TD3_{}_{}".format(self.env_name_load, self.random_seed)
        self.filename_save = "TD3_{}_{}".format(self.env_name, self.random_seed)
        ###########################################
        
        ## Buffers
        self.replay_buffer = None
        self.replay_buffer_dead = None
        self.trajectory_buffer = None
        
        
        ## Policy
        self.policy = None
        
        ###########################################
        ## Update default params
        ###########################################
        self.update_params(params)
        
        
    def load_model(self, env_name, random_seed):
        pass
        
        
    def init_env(self, params = None):
        if params:
            self.update_env(params)
            
        # create env
        if self.env_name == "BipedalWalkerHardcore-v2":
            
            env = CustomizableBipedalWalker()
            env.set_env_params(stump_height=1)
            env.set_env_states(state_mask=np.array([1,1,0,0],dtype=bool), p=np.array([0.1,0.9,0.9,0.9]))
            
            self.env = TimeLimit(env, max_episode_steps=2000)
        else:
            self.env = gym.make(self.env_name)
            
        if '_max_episode_steps' in self.env.__dict__:
            self.max_timesteps = self.env.__dict__['_max_episode_steps']
        else:
            self.max_timesteps = 10**7 ## "inifinte" value if nothing defined
        
        # env params
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.max_action = float(self.env.action_space.high[0])
        
        #check symmetry of actions
        assert  np.all((-self.env.action_space.low) == self.env.action_space.high)
        
        
    def init_buffers_and_agent(self, load, solved):
        """
        Create fresh buffers and a fresh TD3 policy, optionally restoring weights.

        load   -- when truthy, call `policy.load` with `directory` and `filename_load`
        solved -- when truthy (and `load` is set), append "_solved" to the file name

        The dead-transition and trajectory buffers are only (re)created when
        `danger_enable` is set. Requires `init_env` to have run, because the
        state/action dimensions are passed to the policy.
        """
        # Buffers
        self.replay_buffer = ReplayBuffer()
        if self.danger_enable:
            self.replay_buffer_dead = ReplayBuffer()
            self.trajectory_buffer = TrajectoriesEndBuffer(self.max_trajectory_size)

        # Policy
        self.policy = TD3(
            self.lr, self.state_dim, self.action_dim, self.max_action,
            self.danger_enable, self.danger_threshold,
            self.directory, self.filename_save, self.epochs_for_danger,
        )

        # Nothing more to do unless a checkpoint was requested.
        if not load:
            return

        checkpoint_name = self.filename_load + "_solved" if solved else self.filename_load
        self.policy.load(self.directory, checkpoint_name)
           
        
    def train(self, params, debug=False):
        self.update_params(params)
        self.first_finish_ep_num  = -1
        assert self.policy is not None
        assert self.max_timesteps is not None
        
        # Random seed
        if self.random_seed is not None:
            os.environ['PYTHONHASHSEED']=str(self.random_seed)
            print("Random Seed: {}".format(self.random_seed))
            np.random.seed(self.random_seed)
            self.env.seed(self.random_seed)
            random.seed(self.random_seed)
            torch.manual_seed(self.random_seed)
         
    
        # logging variables:
        avg_reward = 0
        ep_reward = 0
        tot_time_steps = 0
        average_time_steps = 0
    
        log_f = open("log.txt","w+")
    
        # training procedure:
        print('Start training')
        reach_the_end = False
        
        if self.param_noise_enable:
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05,desired_action_stddev=0.45, adaptation_coefficient=1.05)
        # amount of updates in dangers
        tot_updates_amount = 0
        self.action_updates = 0
        ep_updates_amount = 0

        
        
        for episode in range(1, self.max_episodes + 1):
            
            ep_updates_amount = 0
            state = self.env.reset()        
            
            if  self.param_noise_enable:
                self.policy.perturb_actor_parameters(param_noise)
            
            for t in range(self.max_timesteps):
                # select action and add exploration noise:
                
                if tot_time_steps + t <= self.warmup_steps:
                    action = self.env.action_space.sample()
                else:
                    action = self.policy.select_action(state, danger=self.danger_enable, param_noise=self.param_noise_enable)
                    if self.policy.action_update:
                        ep_updates_amount += 1
                    
                    if self.noise_random_enable:
                        action = action + np.random.normal(0, self.exploration_noise, size=self.env.action_space.shape[0])
                    action = action.clip(self.env.action_space.low, self.env.action_space.high)
                    
                #print("action {}".format(action))
                # take action in env:
                next_state, reward, done, _ = self.env.step(action)
            
                self.replay_buffer.add((state, action, reward, next_state, float(done)))
                if self.danger_enable:
                    self.trajectory_buffer.add((state, action, reward, next_state, float(done)))
            
                if reward == self.terminal_reward:

                    if self.danger_enable:
                        self.trajectory_buffer.transfer_to_replay_buffer(self.replay_buffer_dead)                    
                    
                        if debug:
                            tmp_st = torch.FloatTensor(state.reshape(1, -1)).to(device)
                            tmp_a = torch.FloatTensor(action.reshape(1, -1)).to(device)
                            print("\tDead after pair with prob: ", self.policy.critic_danger(tmp_st, tmp_a).cpu().data.numpy().flatten())
                    
                state = next_state
                avg_reward += reward
                ep_reward += reward
                
                #if done and t < self.max_timesteps - 1 and reward != self.terminal_reward and reach_the_end==False:
                #    print('Reach the end. Episode: {}. r={} t={}'.format(episode, ep_reward, tot_time_steps + t))
                #    reach_the_end = True
                #    self.first_finish_ep_num = episode
                #    break
                
                if done or t==(self.max_timesteps-1): 
                    tot_time_steps += t
                    average_time_steps += t
                    tot_updates_amount += ep_updates_amount 
                    self.action_updates += ep_updates_amount

                    self.policy.update(self.replay_buffer, self.replay_buffer_dead, t, self.batch_size, self.batch_size_danger, self.gamma, self.polyak, self.policy_noise, self.noise_clip, self.policy_delay)
                    if t==(self.max_timesteps - 1):                    
                        print("\tStuck, ep reward {}".format(ep_reward))
                    break
            
            # logging updates:
            log_f.write('{},{}\n'.format(episode, ep_reward))
            log_f.flush()
            ep_reward = 0
            
            # if avg reward > 300 then save and stop traning:
            if (avg_reward / self.log_interval) >= 300:
                print("########## Solved! ###########")
                name = self.filename_save + '_solved'
                self.policy.save(self.directory, name, optimizers=True, danger=True)
                log_f.close()
                break
            
            # update perturbed actor
            #print('start perturtbated update')
            if self.param_noise_enable:
                states, actions_perturbed, _, _, _ = self.replay_buffer.get_last(t)
                actions_perturbed = np.array(actions_perturbed, copy=False)
            
                actions_clear = self.policy.select_action(states, danger=False, param_noise=False)
                ddpg_sq_distance = ddpg_sq_distance_metric(actions_perturbed, actions_clear)
                param_noise.adapt(ddpg_sq_distance)
            #print('end perturtbated update t={}'.format(t))
            
            # save policy 
            if episode > 0 and episode % (5 * self.log_interval) == 0:
                self.policy.save(self.directory, self.filename_save, optimizers=True, danger=True)
            
            # print avg reward every log interval:
            if episode % self.log_interval == 0:
                avg_reward = int(avg_reward / self.log_interval)
                average_time_steps = int(average_time_steps / self.log_interval)
                print("Episode: {}\tAverage Reward: {}\tAverage steps: {}\t Total: {}".format(episode, avg_reward, average_time_steps, tot_time_steps))
                avg_reward = 0
                average_time_steps = 0
            
            if episode % self.evaluate_after_episodes == 0 and episode >= 100:
                reach_the_end, mean_reward = self.evaluate(params ={}, time_steps = tot_time_steps, episode = episode)
                if reach_the_end:
                    self.first_finish_ep_num = episode
                    print('Reach the end')
                    print('Updates\tSteps\tReward\tEpisode\tSeed')
                    print('{}\t{}\t{}\t{}\t{}'.format(self.action_updates, tot_time_steps, mean_reward, episode, self.random_seed))
                    break
    
    def evaluate(self, params, time_steps = None, episode = None):
        self.update_params(params)
        print('Evaluation. Timesteps: {} Episodes: {}'.format(time_steps, episode))
        reach_the_end = 0
        rewards = np.zeros(self.evaluate_average_on_episodes)
        for ep in range(self.evaluate_average_on_episodes):
            ep_reward = 0
            state = self.env.reset()
            for t in range(self.max_timesteps):
                action = self.policy.select_action(state, debug=False, danger=True)
                state, reward, done, _ = self.env.step(action)
                ep_reward += reward
                if done:
                    s = "\t\tDead {}\tUpdate {}".format( reward == self.terminal_reward,  self.policy.action_update)
                    
                    if reward != self.terminal_reward and t !=self.max_timesteps-1:
                        print(s+'\tFinished')
                        reach_the_end += 1
                    else:
                        print(s)
                    break

            rewards[ep] = ep_reward 
            
        print('\t\tMean reward: {}. All rewards: {}'.format(int(np.mean(rewards)), list(map(int,rewards)) ))
        return reach_the_end == self.evaluate_average_on_episodes, int(np.mean(rewards))
    
    
    def is_passed(self, step_num, reward):
        if reward != self.terminal_reward:
            if step_num < self.terminal_reward:
                return True

        return False

# Debug


## Test Research class

In [0]:
import datetime
# Overrides for the Research defaults; any setting not listed here keeps the
# default assigned in Research.__init__.
params = {'env_name': "LunarLanderContinuous-v2",
          'random_seed':332447659, 
          'danger_enable':True,
          'max_trajectory_size' : 20,
          'evaluate_after_episodes' : 5,
          'evaluate_average_on_episodes' : 5,
          #'max_timesteps':1000,
          'exploration_noise':0.3,
          'max_episodes' : 1000,}


# Timestamp printed at the start of the run.
print(datetime.datetime.now())

new_research = Research(params)

# init_env() builds the environment and overwrites max_timesteps with the
# environment's own episode limit (or 10**7 when it has none); the buffers and
# the agent are then created from scratch, without loading a checkpoint.
new_research.init_env()
new_research.init_buffers_and_agent(load=False, solved=False)
# prime = {19152527, 21192173,  33535573,  14245223,    285781, 
#          90109337, 634600381, 892392491, 48473791903, 7238744783}
#  'max_timesteps':10,
print(new_research.max_timesteps)
# No further overrides at train time; debug=True prints the danger critic's
# output each time a terminal reward is observed.
new_research.train({}, debug=True)

2019-11-20 22:06:12.178057
random_seed set to 332447659
max_episodes set to 1000
danger_enable set to True
max_trajectory_size set to 20
exploration_noise set to 0.3
evaluate_after_episodes set to 5
evaluate_average_on_episodes set to 5
env_name set to LunarLanderContinuous-v2
1000
Random Seed: 332447659
Start training
	Dead after pair with prob:  [0.5016838]
	Dead after pair with prob:  [0.5011838]
	Dead after pair with prob:  [0.54413456]
	Dead after pair with prob:  [0.498179]
	Dead after pair with prob:  [0.5008072]
	Dead after pair with prob:  [0.5101867]
	Dead after pair with prob:  [0.5067465]
	Dead after pair with prob:  [0.5376061]
	Dead after pair with prob:  [0.4780513]
	Dead after pair with prob:  [0.617012]
Episode: 10	Average Reward: -195	Average steps: 113	 Total: 1130
	Dead after pair with prob:  [0.7569107]
	Dead after pair with prob:  [0.74205565]
	Dead after pair with prob:  [0.47140846]
	Dead after pair with prob:  [0.80857617]
	Dead after pair with prob:  [0.864101