<a href="https://colab.research.google.com/github/akterskii/RL/blob/master/Dead%20prediction/TD3PG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!pip install gym['box2d']

Collecting box2d-py>=2.3.5; extra == "box2d" (from gym[box2d])
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K     |████████████████████████████████| 450kB 2.8MB/s 
Installing collected packages: box2d-py
Successfully installed box2d-py-2.3.8


In [17]:
# Run this cell to mount your Google Drive (Colab only; asks for authorization).
from google.colab import drive

# Keep the mount point in one place so the path is not repeated as a literal.
mount = '/content/drive'
drive.mount(mount)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Actor and Critic

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

class Actor(nn.Module):
    """Deterministic policy network: state -> bounded action.

    The network has two hidden ReLU layers (400 and 300 units) and a tanh
    output scaled by ``max_action``, so every action component lies in
    ``[-max_action, max_action]``.

    Args:
        state_dim: number of input features (size of the state vector).
        action_dim: number of output features (size of the action vector).
        max_action: factor applied to the tanh output.
        name: checkpoint base name; stored as ``fname = name + '.pth'``.
    """

    def __init__(self, state_dim, action_dim, max_action, name):
        super().__init__()

        # The attribute names l1/l2/l3 end up as state_dict keys, so they
        # must stay stable for previously saved checkpoints to load.
        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)

        self.max_action = max_action
        self.fname = name + '.pth'

    def forward(self, state):
        """Return the action for ``state`` (last dimension is ``state_dim``)."""
        hidden = F.relu(self.l1(state))
        hidden = F.relu(self.l2(hidden))
        return torch.tanh(self.l3(hidden)) * self.max_action
    
    
        
class Critic(nn.Module):
    """State-action value network, optionally squashed to a probability.

    The state and action are concatenated along dimension 1 and passed
    through two hidden ReLU layers (400 and 300 units) to a single output.

    Args:
        state_dim: size of the state vector.
        action_dim: size of the action vector.
        name: identifier of the network; also used for ``fname = name + '.pth'``.
        probability: when True the output goes through a sigmoid, so it lies
            in (0, 1); when False the raw linear output is returned.
    """

    def __init__(self, state_dim, action_dim, name, probability=False):
        super().__init__()

        # Attribute names l1/l2/l3 are state_dict keys; keep them stable.
        self.l1 = nn.Linear(state_dim + action_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, 1)

        self.probability = probability

        self.name = name
        self.fname = name + '.pth'

    def forward(self, state, action):
        """Return one value per row of the ``(state, action)`` batch."""
        features = torch.cat([state, action], 1)
        features = F.relu(self.l1(features))
        features = F.relu(self.l2(features))
        value = self.l3(features)
        return torch.sigmoid(value) if self.probability else value
    

cuda:0


## Agent

In [0]:
class TD3:
    def __init__(self, lr, state_dim, action_dim, max_action, danger_threshold, directory, fname, epochs_for_danger):
        """Create the TD3 networks and the additional "danger" actor/critic.

        Args:
            lr: learning rate shared by every Adam optimizer created here.
            state_dim: size of the state vector.
            action_dim: size of the action vector.
            max_action: bound used by the actors and for clamping target actions.
            danger_threshold: value of ``critic_danger`` above which
                ``select_action`` switches to ``actor_danger``.
            directory: default directory for ``save``/``load``.
            fname: default file-name prefix for ``save``/``load``.
            epochs_for_danger: number of passes of the danger training loop
                executed by each ``update`` call.
        """
        def make_actor(name):
            return Actor(state_dim, action_dim, max_action, name).to(device)

        def make_critic(name, probability=False):
            return Critic(state_dim, action_dim, name, probability=probability).to(device)

        def make_adam(network):
            return optim.Adam(network.parameters(), lr=lr)

        # Networks are instantiated in the same order as before so that the
        # random weight initialisation of each one is unchanged for a given seed.

        # Main policy and its target copy.
        self.actor = make_actor('actor')
        self.actor_target = make_actor('actor_target')
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = make_adam(self.actor)

        # Twin critics, each with a target copy.
        self.critic_1 = make_critic('critic_1')
        self.critic_1_target = make_critic('critic_1_target')
        self.critic_1_target.load_state_dict(self.critic_1.state_dict())
        self.critic_1_optimizer = make_adam(self.critic_1)

        self.critic_2 = make_critic('critic_2')
        self.critic_2_target = make_critic('critic_2_target')
        self.critic_2_target.load_state_dict(self.critic_2.state_dict())
        self.critic_2_optimizer = make_adam(self.critic_2)

        self.max_action = max_action

        # Danger model: a fallback actor and a sigmoid-output critic (no targets).
        self.actor_danger = make_actor('actor_danger')
        self.actor_danger_optimizer = make_adam(self.actor_danger)

        self.critic_danger = make_critic('critic_danger', probability=True)
        self.critic_danger_optimizer = make_adam(self.critic_danger)

        self.threshold = danger_threshold
        self.directory = directory
        self.fname = fname
        self.epochs_for_danger = epochs_for_danger
        # True when the most recent select_action call used actor_danger.
        self.action_update = False
    
    def select_action(self, state, debug=False):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        action = self.actor(state)
        
        init_prob_danger = self.critic_danger(state, action).cpu().data.numpy().flatten()
        
        self.action_update = False
        if  init_prob_danger > self.threshold:
            action = self.actor_danger(state)
            new_prob_danger = self.critic_danger(state, action).cpu().data.numpy().flatten()
            self.action_update = True
            if debug:
              print( "\t\t\tP before: {}. P after: {}".format(init_prob_danger, new_prob_danger))
            
        return action.cpu().data.numpy().flatten()
    
    
        
    def update(self, replay_buffer, replay_buffer_danger, n_iter, batch_size, batch_size_danger, gamma, polyak, policy_noise, noise_clip, policy_delay):
        
        for i in range(n_iter):
            # Sample a batch of transitions from replay buffer:
            state, action_, reward, next_state, done = replay_buffer.sample(batch_size)            
            state = torch.FloatTensor(state).to(device)
            action = torch.FloatTensor(action_).to(device)
            reward = torch.FloatTensor(reward).reshape((batch_size,1)).to(device)
            next_state = torch.FloatTensor(next_state).to(device)
            done = torch.FloatTensor(done).reshape((batch_size,1)).to(device)
                                    
            # Select next action according to target policy:
            noise = torch.FloatTensor(action_).data.normal_(0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (self.actor_target(next_state) + noise)
            next_action = next_action.clamp(-self.max_action, self.max_action)
            
            # Compute target Q-value:
            target_Q1 = self.critic_1_target(next_state, next_action)
            target_Q2 = self.critic_2_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + ((1-done) * gamma * target_Q).detach()
            
                                    
            # Optimize Critic 1:
            current_Q1 = self.critic_1(state, action)
            loss_Q1 = F.mse_loss(current_Q1, target_Q)
            self.critic_1_optimizer.zero_grad()
            loss_Q1.backward()
            self.critic_1_optimizer.step()
            
            # Optimize Critic 2:
            current_Q2 = self.critic_2(state, action)
            loss_Q2 = F.mse_loss(current_Q2, target_Q)
            self.critic_2_optimizer.zero_grad()
            loss_Q2.backward()
            self.critic_2_optimizer.step()
            
            
            # Delayed policy updates:
            if i % policy_delay == 0:
                # Compute actor loss:
                actor_loss = -self.critic_1(state, self.actor(state)).mean()
                
                # Optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()                
                
                # Polyak averaging update:
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data))
                
                for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()):
                    target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data))
                
                for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()):
                    target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data))
                  
                  
        batch_steps = max(1, replay_buffer_danger.size // batch_size_danger)
        
        for _ in range(self.epochs_for_danger):
          for j in range(batch_steps):
            # Sample two batches of transitions: deadend and normals
            state_not_danger, action_not_danger, _, _, done_not_danger = replay_buffer.sample(batch_size_danger)            
            state_not_danger = torch.FloatTensor(state_not_danger).to(device)
            action_not_danger = torch.FloatTensor(action_not_danger).to(device)
            done_not_danger = torch.FloatTensor(done_not_danger).reshape((batch_size_danger, 1)).to(device)
            
            state_danger, action_danger, _, _, done_danger = replay_buffer_danger.sample(batch_size_danger)            
            state_danger = torch.FloatTensor(state_danger).to(device)
            action_danger = torch.FloatTensor(action_danger).to(device)
            done_danger = torch.FloatTensor(done_danger).reshape((batch_size_danger, 1)).to(device)
                  
            # Compute danger probabilities
            target_Q_not_danger = done_not_danger
            target_Q_danger = done_danger
            #pprint("dan_q", target_Q_not_danger, target_Q_danger)
                  
            # Optimize Critic Danger:
            current_Q_danger = self.critic_danger(state_danger, action_danger)
            current_Q_not_danger = self.critic_danger(state_not_danger, action_not_danger)
            loss_Q_danger = F.mse_loss(current_Q_danger, target_Q_danger)
            loss_Q_not_danger = F.mse_loss(current_Q_not_danger, target_Q_not_danger)
            loss_QD =(loss_Q_danger + loss_Q_not_danger)/2
            self.critic_danger_optimizer.zero_grad()
            loss_QD.backward()
            self.critic_danger_optimizer.step()
                  
            if j % policy_delay == 0:
              actor_danger_loss = self.critic_danger(state_danger, self.actor_danger(state_danger)).mean()
                    
              # Optimize the actor for danger
              self.actor_danger_optimizer.zero_grad()
              actor_danger_loss.backward()
              self.actor_danger_optimizer.step()           
                     
                
    def save(self, directory=None, fname=None, optimizers = False, danger = False):
        if directory is None:
            directory = self.directory
        if fname is None:
            fname = self.fname
            
        base_path = "%s/%s_"% (directory, fname)
        
        torch.save(self.actor.state_dict(), base_path + self.actor.fname)
        torch.save(self.actor_target.state_dict(), base_path + self.actor_target.fname)
        
        torch.save(self.critic_1.state_dict(), base_path + self.critic_1.fname)
        torch.save(self.critic_1_target.state_dict(), base_path + self.critic_1_target.fname)
        
        torch.save(self.critic_2.state_dict(), base_path + self.critic_2.fname)
        torch.save(self.critic_2_target.state_dict(), base_path + self.critic_2_target.fname)
        
        if danger:
            torch.save(self.actor_danger.state_dict(),  base_path + self.actor_danger.fname)
            torch.save(self.critic_danger.state_dict(), base_path + self.critic_danger.fname)
        
        if optimizers:
            torch.save(self.actor_optimizer.state_dict(), '%s/%s_actor_optimizer.pth' % (directory, fname))
            torch.save(self.critic_1_optimizer.state_dict(), '%s/%s_critic_1_optimizer.pth' % (directory, fname))
            torch.save(self.critic_2_optimizer.state_dict(), '%s/%s_critic_2_optimizer.pth' % (directory, fname))
            if danger:
                torch.save(self.actor_danger_optimizer.state_dict(), '%s/%s_actor_danger_optimizer.pth' % (directory, fname))
                torch.save(self.critic_danger_optimizer.state_dict(), '%s/%s_critic_danger_optimizer.pth' % (directory, fname))
                
        
    def load(self, directory=None, fname=None, optimizers=False, danger = False):
        if directory is None:
            directory = self.directory
        if fname is None:
            fname = self.fname
            
        base_path = "%s/%s_"% (directory, fname)
        
        self.actor.load_state_dict(torch.load(base_path + self.actor.fname, map_location=lambda storage, loc: storage))
        self.actor_target.load_state_dict(torch.load(base_path + self.actor_target.fname, map_location=lambda storage, loc: storage))
        
        self.critic_1.load_state_dict(torch.load(base_path + self.critic_1.fname, map_location=lambda storage, loc: storage))
        self.critic_1_target.load_state_dict(torch.load(base_path + self.critic_1_target.fname, map_location=lambda storage, loc: storage))
        
        self.critic_2.load_state_dict(torch.load(base_path + self.critic_2.fname, map_location=lambda storage, loc: storage))
        self.critic_2_target.load_state_dict(torch.load(base_path + self.critic_2_target.fname, map_location=lambda storage, loc: storage))
        
        if danger:
            self.actor_danger.load_state_dict(torch.load('%s/%s_actor_danger.pth' % (directory, name), map_location=lambda storage, loc: storage))
            self.critic_danger.load_state_dict(torch.load('%s/%s_critic_danger.pth' % (directory, name), map_location=lambda storage, loc: storage))
        
        if optimizers:
            self.actor_optimizer.load_state_dict(torch.load( '%s/%s_actor_optimizer.pth' % (directory, name), map_location=lambda storage, loc: storage))
            self.critic_1_optimizer.load_state_dict(torch.load('%s/%s_critic_1_optimizer.pth' % (directory, name), map_location=lambda storage, loc: storage))
            self.critic_2_optimizer.load_state_dict(torch.load('%s/%s_critic_2_optimizer.pth' % (directory, name), map_location=lambda storage, loc: storage))
            if danger:
                self.actor_danger_optimizer.load_state_dict(torch.load(base_path + self.actor_danger.fname, map_location=lambda storage, loc: storage))
                self.critic_danger_optimizer.load_state_dict(torch.load(base_path + self.critic_danger.fname, map_location=lambda storage, loc: storage))
        
        
    def load_actor(self, directory=None, fname=None, danger=False):
        if directory is None:
            directory = self.directory
        if fname is None:
            fname = self.fname      
      
        base_path = "%s/%s_"% (directory, fname)
        self.actor.load_state_dict(torch.load(base_path + self.actor.fname, map_location=lambda storage, loc: storage))
        self.actor_target.load_state_dict(torch.load(base_path + self.actor_target.fname, map_location=lambda storage, loc: storage))
        if danger:
          self.actor_danger.load_state_dict(torch.load(base_path + self.actor_danger.fname, map_location=lambda storage, loc: storage))
          self.critic_danger.load_state_dict(torch.load(base_path + self.critic_danger.fname, map_location=lambda storage, loc: storage))
        
        
        
      

## Custom Bipedal

In [0]:
import numpy as np
import warnings
from gym.envs.box2d.bipedal_walker import *
class CustomizableBipedalWalker(BipedalWalker):
    def __init__(self):
        """Set the default terrain parameters, then initialise the base walker.

        ``self.params`` must exist before ``BipedalWalker.__init__`` runs,
        because ``_generate_terrain`` reads it.
        """
        # Obstacle geometry bounds (low/high pairs are passed to np_random.randint),
        # plus the set of terrain states that may follow a grass segment.
        defaults = {
            'stump_height_low': 1,
            'stump_height_high': 3,
            'pit_depth': 4,
            'pit_width_low': 3,
            'pit_width_high': 5,
            'stair_heights': [-.5, .5],
            'stair_width_low': 4,
            'stair_width_high': 5,
            'stair_steps_low': 3,
            'stair_steps_high': 5,
            'states': [0],
            'state_probs': None
        }
        self.default_params = defaults
        # Shallow copy: the active parameters can be replaced without
        # touching the defaults mapping.
        self.params = dict(defaults)
        BipedalWalker.__init__(self)
        
    def _update_env_params(self, **kwargs):
        """Overlay ``kwargs`` on the current parameters and reset the environment."""
        # TODO: add kind of sanity check here
        merged = dict(self.params)
        merged.update(kwargs)
        self.params = merged
        # Reset so the new parameters take effect; the returned state is not needed.
        self.reset()
        
    def reset_env_params(self, hardcore=False):
        """Restore the default parameters and reset the environment.

        :param hardcore: when True, enable all four terrain states
            (``np.arange(4)``) instead of the default ``[0]``.
        :return: None
        """
        overrides = {'states': np.arange(4)} if hardcore else {}
        restored = {**self.default_params, **overrides}
        self._update_env_params(**restored)
    
    def set_env_states(self, state_mask, p=None):
        """
        Select which terrain states may be generated and with what probability.

        :param state_mask: np.array(,dtype=bool) that masks ["GRASS", "STUMP", "STAIRS", "PIT"].
            Note that masking out "GRASS" takes no effect.
        :param p: np.array or list of probabilities: [p_grass, p_stump, p_stairs, p_pit].
            Probs corresponding to masked out states are ignored; the remaining
            ones are renormalised to sum to 1.
        :raises ValueError: if no state is selected, if any probability is
            negative (or NaN), or if the selected probabilities sum to zero.
        :return: None
        """
        states_ = np.arange(4)[state_mask]
        # Fail here rather than later inside terrain generation, where an
        # empty choice set would produce a much less helpful error.
        if states_.size == 0:
            raise ValueError("state_mask must select at least one state")

        p_ = None
        if p is not None:
            p_ = np.array(p)
            if not np.all(p_ >= 0):
                raise ValueError("state probabilities must be non-negative, got {}".format(p))
            selected = p_[state_mask]
            total = selected.sum()
            # A zero total would turn the normalised probabilities into NaN.
            if total <= 0:
                raise ValueError("probabilities of the selected states must sum to a positive value")
            p_ = selected / total
        self._update_env_params(states=states_, state_probs=p_)
    
    def set_env_params(self, pit_width=None, stair_width=None, stair_steps=None, stump_height=None):
        """
            Set the sampling range of obstacle dimensions and reset the environment.

            NB: All params are integers or tuples of integers

            An integer ``v`` becomes the range ``[v, v + 1)``; a pair
            ``(low, high)`` is used as given when ``high - low >= 1``,
            otherwise a warning is issued and ``[low, low + 1)`` is used.
            Arguments left as None (or of any other type) are ignored.
        """
        requested = {
            'pit_width': pit_width,
            'stair_width': stair_width,
            'stair_steps': stair_steps,
            'stump_height': stump_height,
        }
        params = {}
        for k, v in requested.items():
            if type(v) is int:
                low, high = v, v + 1
            elif isinstance(v, (tuple, list)):
                if v[1] - v[0] >= 1:
                    low, high = v[0], v[1]
                else:
                    warnings.warn(f'{k} shoud be an integer. {k}[1] - {k}[0] < 1 '+\
                                  f'=> will set {k}_low = {v[0]}, {k}_high = {v[0]+1}')
                    low, high = v[0], v[0] + 1
            else:
                # None or unsupported type: leave this parameter unchanged.
                continue
            params[k + '_low'] = low
            params[k + '_high'] = high
        self._update_env_params(**params)
        
    def _generate_terrain(self, hardcore=True):
        """Generate the terrain profile and the static bodies of the obstacles.

        Walks over ``TERRAIN_LENGTH`` steps with a small state machine that
        alternates between GRASS and one obstacle state drawn from
        ``self.params['states']`` (with probabilities ``self.params['state_probs']``).
        Obstacle dimensions are drawn from the ``*_low`` / ``*_high`` entries
        of ``self.params``.

        Fills ``self.terrain_x`` / ``self.terrain_y`` (one point per step),
        ``self.terrain`` (static bodies) and ``self.terrain_poly``
        (polygon/colour pairs, presumably consumed by the base-class renderer).

        :param hardcore: not referenced in this body; obstacle selection is
            driven entirely by ``self.params``.
        """
        GRASS, STUMP, STAIRS, PIT, _STATES_ = range(5)
        state    = GRASS
        velocity = 0.0
        y        = TERRAIN_HEIGHT
        counter  = TERRAIN_STARTPAD
        # oneshot is True only on the first step after a state change; that is
        # the step on which an obstacle's bodies are created.
        oneshot  = False
        self.terrain   = []
        self.terrain_x = []
        self.terrain_y = []
        for i in range(TERRAIN_LENGTH):
            x = i*TERRAIN_STEP
            self.terrain_x.append(x)

            # Note: on the first step back on grass (state==GRASS and oneshot)
            # no branch below matches, so y is carried over unchanged.
            if state==GRASS and not oneshot:
                # Damped random walk of the height, pulled back towards TERRAIN_HEIGHT;
                # the random perturbation only starts after the start pad.
                velocity = 0.8*velocity + 0.01*np.sign(TERRAIN_HEIGHT - y)
                if i > TERRAIN_STARTPAD: velocity += self.np_random.uniform(-1, 1)/SCALE   #1
                y += velocity

            elif state==PIT and oneshot:
                # counter doubles as the pit width in terrain steps
                # (assuming NumPy randint semantics, the high bound is exclusive).
                counter = self.np_random.randint(self.params['pit_width_low'], 
                                                 self.params['pit_width_high'])
                PIT_H = self.params['pit_depth']
                # One-step-wide block at the near edge of the pit ...
                poly = [
                    (x,              y),
                    (x+TERRAIN_STEP, y),
                    (x+TERRAIN_STEP, y-PIT_H*TERRAIN_STEP),
                    (x,              y-PIT_H*TERRAIN_STEP),
                    ]
                self.fd_polygon.shape.vertices=poly
                t = self.world.CreateStaticBody(
                    fixtures = self.fd_polygon)
                t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6)
                self.terrain.append(t)

                # ... and the same block shifted by the pit width at the far edge.
                self.fd_polygon.shape.vertices=[(p[0]+TERRAIN_STEP*counter,p[1]) for p in poly]
                t = self.world.CreateStaticBody(
                    fixtures = self.fd_polygon)
                t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6)
                self.terrain.append(t)
                # Two extra steps so the state lasts over both edge blocks.
                counter += 2
                # Remembered for the following PIT steps (see next branch).
                original_y = y

            elif state==PIT and not oneshot:
                # Uses PIT_H and original_y set on the oneshot step of this pit:
                # the profile is lowered by the pit depth until the last step.
                y = original_y
                if counter > 1:
                    y -= PIT_H*TERRAIN_STEP

            elif state==STUMP and oneshot:
                # counter is used both as width and height of the square stump
                # (in terrain steps) and as the remaining length of this state.
                counter = self.np_random.randint(self.params['stump_height_low'], self.params['stump_height_high'])
                poly = [
                    (x,                      y),
                    (x+counter*TERRAIN_STEP, y),
                    (x+counter*TERRAIN_STEP, y+counter*TERRAIN_STEP),
                    (x,                      y+counter*TERRAIN_STEP),
                    ]
                self.fd_polygon.shape.vertices=poly
                t = self.world.CreateStaticBody(
                    fixtures = self.fd_polygon)
                t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6)
                self.terrain.append(t)

            elif state==STAIRS and oneshot:
                # Per-step rise (sign gives up/down), step width and number of steps.
                stair_height = self.np_random.choice(self.params['stair_heights'])
                stair_width = self.np_random.randint(self.params['stair_width_low'], 
                                                     self.params['stair_width_high'])
                stair_steps = self.np_random.randint(self.params['stair_steps_low'], 
                                                     self.params['stair_steps_high'])
                original_y = y
                # One slab (one terrain step thick) per stair step.
                for s in range(stair_steps):
                    poly = [
                        (x+(    s*stair_width)*TERRAIN_STEP, y+(   s*stair_height)*TERRAIN_STEP),
                        (x+((1+s)*stair_width)*TERRAIN_STEP, y+(   s*stair_height)*TERRAIN_STEP),
                        (x+((1+s)*stair_width)*TERRAIN_STEP, y+(-1+s*stair_height)*TERRAIN_STEP),
                        (x+(    s*stair_width)*TERRAIN_STEP, y+(-1+s*stair_height)*TERRAIN_STEP),
                        ]
                    self.fd_polygon.shape.vertices=poly
                    t = self.world.CreateStaticBody(
                        fixtures = self.fd_polygon)
                    t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6)
                    self.terrain.append(t)
                # The state lasts for the full horizontal extent of the stairs.
                counter = stair_steps*stair_width

            elif state==STAIRS and not oneshot:
                # Height of the profile along the stairs, derived from how far
                # into the staircase the current step is (uses the values drawn
                # on the oneshot step).
                s = stair_steps*stair_width - counter - stair_height
                n = s/stair_width
                y = original_y + (n*stair_height)*TERRAIN_STEP

            oneshot = False
            self.terrain_y.append(y)
            counter -= 1
            if counter==0:
                # Current segment is over: draw the length of the next one and
                # toggle between grass and an obstacle state.
                counter = self.np_random.randint(TERRAIN_GRASS/2, TERRAIN_GRASS)
                if state==GRASS:
                    state = self.np_random.choice(self.params['states'], p=self.params['state_probs'])
                    oneshot = True
                else:
                    state = GRASS
                    oneshot = True

        # Build one edge body per pair of consecutive profile points, plus a
        # polygon extended down to y=0 stored in terrain_poly.
        self.terrain_poly = []
        for i in range(TERRAIN_LENGTH-1):
            poly = [
                (self.terrain_x[i],   self.terrain_y[i]),
                (self.terrain_x[i+1], self.terrain_y[i+1])
                ]
            self.fd_edge.shape.vertices=poly
            t = self.world.CreateStaticBody(
                fixtures = self.fd_edge)
            # Alternate the green channel between even and odd segments.
            color = (0.3, 1.0 if i%2==0 else 0.8, 0.3)
            t.color1 = color
            t.color2 = color
            self.terrain.append(t)
            color = (0.4, 0.6, 0.3)
            poly += [ (poly[1][0], 0), (poly[0][0], 0) ]
            self.terrain_poly.append( (poly, color) )
        self.terrain.reverse()

## Replay buffer

In [0]:
import numpy as np

class ReplayBuffer:
    """List-backed store of transitions with uniform random sampling.

    A transition is a 5-tuple ``(state, action, reward, next_state, done)``.

    Args:
        max_size: once ``size`` exceeds this, the oldest fifth of the buffer
            is dropped at the next ``sample`` call.
        base_path: directory used by ``save``/``load`` ('' = current directory).
        name: file base name; the file is ``name + '.pth'`` (default 'buffer.pth').
        queue_length: maximum number of transitions taken by ``add_queue``
            (default 5).
    """

    def __init__(self, max_size=1e6, base_path=None, name=None, queue_length=None):
        self.buffer = []
        self.max_size = int(max_size)
        # Number of stored transitions; kept in sync with len(self.buffer).
        self.size = 0

        self.base_path = base_path if base_path is not None else ''
        self.fname = (name if name is not None else 'buffer') + '.pth'
        self.queue_length = 5 if queue_length is None else queue_length

    def _path(self):
        """Return the file path used by ``save`` and ``load``."""
        import os
        return os.path.join(self.base_path, self.fname)

    def add(self, transition):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        self.size += 1
        self.buffer.append(transition)

    def get_probabilities(self, i):
        """Placeholder for non-uniform sampling weights; not implemented.

        Returns None, exactly as before.
        """
        # linear
        return None

    def add_queue(self, queue):
        """Add at most ``queue_length`` transitions from ``queue``.

        The *last* (most recent) entries are taken.
        NOTE(review): the original called ``self.add()`` without an argument,
        so which end of the queue was intended is an assumption; confirm
        against the caller.
        """
        cur_len = min(len(queue), self.queue_length)
        if cur_len <= 0:
            # Guard: a slice of [-0:] would select the whole queue.
            return
        # list() so that deques (which do not support slicing) work as well.
        for transition in list(queue)[-cur_len:]:
            self.add(transition)

    def save(self):
        """Pickle the buffer to ``base_path/fname``; report (not raise) OS errors."""
        import pickle
        try:
            with open(self._path(), 'wb') as f:
                pickle.dump(self.buffer, f)
        except OSError:
            print('Buffer is not saved!\n\n')

    def load(self):
        """Load the buffer from ``base_path/fname``; fall back to empty on OS errors."""
        import pickle
        try:
            with open(self._path(), 'rb') as f:
                # NOTE(security): unpickling can execute arbitrary code;
                # only load buffer files from a trusted source.
                self.buffer = pickle.load(f)
        except OSError:
            self.buffer = []
            print('Buffer is not loaded!\n\n')
        # Keep the counter consistent with whatever was (not) loaded.
        self.size = len(self.buffer)

    def sample(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly with replacement.

        Returns:
            Five numpy arrays ``(state, action, reward, next_state, done)``,
            each with ``batch_size`` as its first dimension.

        Raises:
            ValueError: if the buffer is empty.
        """
        # delete 1/5th of the buffer when full
        if self.size > self.max_size:
            del self.buffer[0:int(self.size/5)]
            self.size = len(self.buffer)

        if not self.buffer:
            raise ValueError('Cannot sample from an empty replay buffer')

        indexes = np.random.randint(0, len(self.buffer), size=batch_size)
        state, action, reward, next_state, done = [], [], [], [], []

        for i in indexes:
            s, a, r, s_, d = self.buffer[i]
            # np.asarray avoids a copy when possible and, unlike
            # np.array(..., copy=False), does not raise under NumPy 2 when a
            # copy is unavoidable (lists, Python scalars).
            state.append(np.asarray(s))
            action.append(np.asarray(a))
            reward.append(np.asarray(r))
            next_state.append(np.asarray(s_))
            done.append(np.asarray(d))

        return np.array(state), np.array(action), np.array(reward), np.array(next_state), np.array(done)

## Test


In [0]:
from PIL import Image

def test(random_seed = None, env_name = "BipedalWalker-v2", solved = False):
    """Run a saved actor for a few episodes and print each episode's reward.

    Args:
        random_seed: only used to build the checkpoint file name
            (defaults to 1); nothing is seeded here.
        env_name: gym environment id; also part of the checkpoint file name.
        solved: when True, load the checkpoint with the '_solved' suffix.
    """
    if random_seed is None:
        random_seed = 1
    n_episodes = 5
    lr = 0.002
    max_timesteps = 2000
    render = False
    save_gif = False

    filename = "TD3_{}_{}".format(env_name, random_seed)
    if solved:
        filename += '_solved'
    directory = "/content/drive/My Drive" #"./preTrained/{}".format(env_name)

    env = gym.make(env_name)
    # The environment is closed exactly once, after all episodes, and also
    # when something fails; it used to be closed inside the episode loop.
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        # A threshold of 2 can never be exceeded by the sigmoid output of the
        # danger critic, so the danger override is effectively disabled here.
        policy = TD3(lr, state_dim, action_dim, max_action, 2, directory, filename,epochs_for_danger=1)

        print(directory, filename)
        policy.load_actor(directory, filename)

        for ep in range(1, n_episodes+1):
            ep_reward = 0
            state = env.reset()
            for t in range(max_timesteps):
                action = policy.select_action(state)
                state, reward, done, _ = env.step(action)
                ep_reward += reward
                if render:
                    env.render()
                    if save_gif:
                        img = env.render(mode = 'rgb_array')
                        img = Image.fromarray(img)
                        img.save('./gif/{}.jpg'.format(t))
                if done:
                    break

            print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
    finally:
        env.close()
        

## Eval

In [0]:
def evaluate(policy, env, n_episodes =10, debug=False, terminal_reward=-100):
    """Run ``policy`` in ``env`` for ``n_episodes`` and report the rewards.

    Args:
        policy: object with ``select_action(state, debug=...)`` and an
            ``action_update`` attribute.
        env: environment with ``reset()`` and ``step(action)`` returning
            ``(state, reward, done, info)``.
        n_episodes: number of episodes to run.
        debug: forwarded to ``policy.select_action`` (it used to be ignored
            and debug output was always on).
        terminal_reward: reward value that marks an episode ending in death;
            only used for the printed "Dead" flag.

    Returns:
        numpy array with the total reward of each episode.
    """
    max_timesteps = 2000
    rewards = np.zeros(n_episodes)
    for ep in range(n_episodes):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            action = policy.select_action(state, debug=debug)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if done:
                print("\t\tDead {}\tUpdate {}".format(reward==terminal_reward,  policy.action_update))
                break
        rewards[ep] = ep_reward

    print('\t\tMean reward: {}. All rewards: {}'.format(int(np.mean(rewards)),list(map(int,rewards)) ))
    return rewards
      
    

## Train

In [0]:
def train(load=False, solved=False, env_name_old="BipedalWalker-v2",
          env_name_new="BipedalWalkerHardcore-v2", terminal_reward=-100,
          rbuf=None, rbuf_dead=None):
    """Train a TD3 policy on a CustomizableBipedalWalker, tracking "dead" transitions.

    Every transition goes into the main replay buffer; transitions whose
    reward equals `terminal_reward` are additionally stored in a second
    ("dead") buffer. Both buffers are handed to `policy.update` at the end
    of every episode.

    Side effects: (over)writes `log.txt` in the working directory with one
    `episode,reward` line per episode and saves checkpoints under the
    hard-coded Drive `directory`.

    Args:
        load: if True, restore the policy from the checkpoint named after
            `env_name_old`; otherwise save the freshly built policy under
            the `env_name_new` checkpoint name before training starts.
        solved: with `load=True`, append "_solved" to the checkpoint name
            that is loaded.
        env_name_old: only used to build the checkpoint name to load.
        env_name_new: only used to build the checkpoint name to save.
        terminal_reward: reward value that marks a transition as "dead".
        rbuf: optional existing replay buffer to keep filling; a new
            ReplayBuffer is created when None.
        rbuf_dead: optional existing buffer of dead transitions; a new
            ReplayBuffer is created when None.

    Returns:
        Tuple `(policy, replay_buffer, replay_buffer_dead)`.
    """
    ######### Hyperparameters #########
    log_interval = 10           # print avg reward after interval
    random_seed = 1000000009
    gamma = 0.99                # discount for future rewards
    batch_size = 100            # num of transitions sampled from replay buffer
    batch_size_danger = batch_size // 2
    lr = 0.001
    exploration_noise = 0.25
    polyak = 0.995              # target policy update parameter (1-tau)
    policy_noise = 0.2          # target policy smoothing noise
    noise_clip = 0.5
    policy_delay = 2            # delayed policy updates parameter
    max_episodes = 2511         # max num of episodes
    max_timesteps = 2000        # max timesteps in one episode
    danger_threshold = 0.7      # probability to perform safe action
    directory = "/content/drive/My Drive"  # where checkpoints are saved / loaded
    filename_load = "TD3_{}_{}".format(env_name_old, random_seed)
    filename_save = "TD3_{}_{}".format(env_name_new, random_seed)
    epochs_for_danger = 3
    episodes_to_evaluate = 30
    ###################################

    env = CustomizableBipedalWalker()
    env.set_env_params(stump_height=1)
    env.set_env_states(state_mask=np.array([1, 1, 0, 0], dtype=bool), p=np.array([0.1, 0.9, 0.9, 0.9]))

    # Seed every RNG *before* the policy is built so that the network
    # initialisation is covered by the seed as well.
    if random_seed:
        print("Random Seed: {}".format(random_seed))
        env.seed(random_seed)
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Buffers: reuse the caller's buffers when given, otherwise start empty.
    # `is not None` is used on purpose so an empty buffer is still reused.
    replay_buffer = rbuf if rbuf is not None else ReplayBuffer()
    replay_buffer_dead = rbuf_dead if rbuf_dead is not None else ReplayBuffer()

    # Policy
    policy = TD3(lr, state_dim, action_dim, max_action, danger_threshold, directory, filename_save, epochs_for_danger)

    # Either restore an existing checkpoint or persist the initial weights.
    if load:
        full_fname = filename_load
        if solved:
            full_fname += "_solved"
        policy.load(directory, full_fname)
    else:
        policy.save(directory, filename_save)

    # logging variables:
    avg_reward = 0              # running sum of rewards over the current log window
    ep_reward = 0
    tot_time_steps = 0
    average_time_steps = 0

    # training procedure:
    print('Start training')

    # The context manager guarantees the log is closed on every exit path
    # (solved, episode budget exhausted, or an exception).
    with open("log.txt", "w+") as log_f:
        for episode in range(1, max_episodes + 1):

            state = env.reset()
            for t in range(max_timesteps):
                # select action and add exploration noise:
                action = policy.select_action(state)
                action = action + np.random.normal(0, exploration_noise, size=env.action_space.shape[0])
                action = action.clip(env.action_space.low, env.action_space.high)

                # take action in env:
                next_state, reward, done, _ = env.step(action)

                replay_buffer.add((state, action, reward, next_state, float(done)))

                if reward == terminal_reward:
                    # Report what the danger critic predicted for the pair that
                    # led to the terminal reward, then remember the transition.
                    tmp_st = torch.FloatTensor(state.reshape(1, -1)).to(device)
                    tmp_a = torch.FloatTensor(action.reshape(1, -1)).to(device)
                    print("\tDead after pair with prob: ", policy.critic_danger(tmp_st, tmp_a).cpu().data.numpy().flatten())
                    replay_buffer_dead.add((state, action, reward, next_state, float(done)))

                state = next_state

                avg_reward += reward
                ep_reward += reward

                # if episode is done then update policy:
                if done or t == (max_timesteps - 1):
                    # t is the zero-based index of the last step of the episode.
                    tot_time_steps += t
                    average_time_steps += t
                    if t == (max_timesteps - 1):
                        print("\tStuck!")
                    policy.update(replay_buffer, replay_buffer_dead, t, batch_size, batch_size_danger, gamma, polyak, policy_noise, noise_clip, policy_delay)
                    break

            # logging updates:
            log_f.write('{},{}\n'.format(episode, ep_reward))
            log_f.flush()
            ep_reward = 0

            # if avg reward > 300 then save and stop training
            # (avg_reward is the sum accumulated since the last log print):
            if (avg_reward / log_interval) >= 300:
                print("########## Solved! ###########")
                name = filename_save + '_solved'
                policy.save(directory, name, optimizers=True, danger=True)
                break

            # periodic checkpoint
            if episode % (5 * log_interval) == 0:
                policy.save(directory, filename_save, optimizers=True, danger=True)

            # print avg reward every log interval:
            if episode % log_interval == 0:
                avg_reward = int(avg_reward / log_interval)
                average_time_steps = int(average_time_steps / log_interval)
                print("Episode: {}\tAverage Reward: {}\tAverage steps: {}\t Total: {}".format(episode, avg_reward, average_time_steps, tot_time_steps))
                avg_reward = 0
                average_time_steps = 0

            # periodic evaluation on the training environment
            if episode % episodes_to_evaluate == 0:
                evaluate(policy, env, debug=False, terminal_reward=terminal_reward)

    return policy, replay_buffer, replay_buffer_dead
  

## Runs

In [0]:
# Replay the checkpoint named after BipedalWalker-v2 / this seed with the "_solved" suffix.
test(
    random_seed=1000000009,
    env_name="BipedalWalker-v2",
    solved=True,
)

/content/drive/My Drive TD3_BipedalWalker-v2_1000000009_solved
Episode: 1	Reward: 303
Episode: 2	Reward: 304
Episode: 3	Reward: 303
Episode: 4	Reward: 303
Episode: 5	Reward: 304


In [0]:
# Start training from the "_solved" BipedalWalker-v2 checkpoint and keep the
# resulting policy and both replay buffers in the notebook namespace.
policy, replay_buffer, replay_buffer_dead = train(
    load=True,
    solved=True,
    env_name_old="BipedalWalker-v2",
)

Random Seed: 1000000009
Start training
	Dead after pair with prob:  [0.51351476]
	Dead after pair with prob:  [0.55542177]
	Dead after pair with prob:  [0.5719095]
	Dead after pair with prob:  [0.40145162]
	Dead after pair with prob:  [0.6086938]
	Dead after pair with prob:  [0.13611373]
	Dead after pair with prob:  [0.3589761]
	Dead after pair with prob:  [0.3530743]
	Dead after pair with prob:  [0.63434535]
	Dead after pair with prob:  [0.9353273]
Episode: 10	Average Reward: -89	Average steps: 172	 Total: 1723
	Dead after pair with prob:  [0.00905418]
	Dead after pair with prob:  [0.9884227]
	Dead after pair with prob:  [0.9845589]
	Dead after pair with prob:  [0.98658633]
	Dead after pair with prob:  [0.97460765]
	Dead after pair with prob:  [0.98030925]
	Dead after pair with prob:  [0.9558893]
	Dead after pair with prob:  [0.9693786]
	Dead after pair with prob:  [0.9675684]
	Dead after pair with prob:  [0.49256596]
Episode: 20	Average Reward: -97	Average steps: 155	 Total: 3279
	De

In [0]:
# Resume training from the BipedalWalkerHardcore-v2 checkpoint, handing over the
# buffers from a previous train() run. Needs `replay_buffer` and
# `replay_buffer_dead` to exist in the kernel and a train() signature that
# accepts `rbuf` / `rbuf_dead`.
policy, replay_buffer, replay_buffer_dead = train(
    load=True,
    solved=False,
    env_name_old="BipedalWalkerHardcore-v2",
    rbuf=replay_buffer,
    rbuf_dead=replay_buffer_dead,
)

NameError: ignored

In [0]:
# Same replay as the earlier test cell, with seed and environment name passed positionally.
test(
    1000000009,
    "BipedalWalker-v2",
    solved=True,
)

/content/drive/My Drive TD3_BipedalWalker-v2_1000000009_solved
Episode: 1	Reward: 303
Episode: 2	Reward: 303
Episode: 3	Reward: 303
Episode: 4	Reward: 304
Episode: 5	Reward: 303


# Debug


## Test targets

In [0]:
# ---- Hyper-parameters kept at module level for the debug cells below ----
log_interval = 10            # print avg reward after interval
random_seed = 1
gamma = 0.99                 # discount for future rewards
batch_size = 100             # num of transitions sampled from replay buffer
lr = 0.001
exploration_noise = 0.15
polyak = 0.995               # target policy update parameter (1-tau)
policy_noise = 0.2           # target policy smoothing noise
noise_clip = 0.5
policy_delay = 2             # delayed policy updates parameter
max_episodes = 1500          # max num of episodes
max_timesteps = 2000         # max timesteps in one episode
directory = "/content/drive/My Drive"  # where trained models are saved

# ---- Environment: walker with stump height 2 and a custom state mask ----
env = CustomizableBipedalWalker()
env.set_env_params(stump_height=2)
env.set_env_states(
    state_mask=np.array([1, 1, 0, 0], dtype=bool),
    p=np.array([0.1, 0.9, 0.9, 0.9]),
)

# ---- Dimensions read from the environment's spaces ----
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# ---- Fresh (untrained) policy for the experiments below ----
policy = TD3(lr, state_dim, action_dim, max_action, danger_threshold=0.6)

In [0]:
# Rebuild the environment with the same configuration as the setup cell.
env = CustomizableBipedalWalker()
env.set_env_params(stump_height=2)
env.set_env_states(
    state_mask=np.array([1, 1, 0, 0], dtype=bool),
    p=np.array([0.1, 0.9, 0.9, 0.9]),
)

replay_buffer = ReplayBuffer()

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Play one episode with Gaussian exploration noise, recording every
# transition and printing each reward together with whether it is above -100.
state = env.reset()
done = False
while not done:
    action = policy.select_action(state)
    action = (
        action + np.random.normal(0, exploration_noise, size=env.action_space.shape[0])
    ).clip(env.action_space.low, env.action_space.high)

    # take action in env:
    next_state, reward, done, _ = env.step(action)
    replay_buffer.add((state, action, reward, next_state, float(done)))
    state = next_state
    print(reward, reward > -100)



In [0]:
# Draw one batch of `batch_size` transitions from the rollout buffer and unpack its five fields.
(state, action_, reward, next_state, done) = replay_buffer.sample(batch_size)

In [0]:
# Inspect the policy's danger actor.
# NOTE(review): the recorded output is an Actor repr; if `actor_danger` is an
# nn.Module attribute, calling it without a state would fail -- confirm whether
# the call parentheses are intended.
policy.actor_danger()

Actor(
  (l1): Linear(in_features=24, out_features=400, bias=True)
  (l2): Linear(in_features=400, out_features=300, bias=True)
  (l3): Linear(in_features=300, out_features=4, bias=True)
)

In [0]:
# Draw another batch and move the sampled states onto the compute device.
(state, action_, reward, next_state, done) = replay_buffer.sample(batch_size)
# NOTE(review): reshape(1, -1) collapses everything sampled into a single row;
# presumably a per-sample layout was wanted -- confirm.
state = state.reshape(1, -1)
state = torch.FloatTensor(state).to(device)

In [0]:
# Rebuild the checkpoint names used by train() and store the current debug policy.
random_seed = 1000000009
env_name_old = "BipedalWalker-v2"
env_name_new = "BipedalWalkerHardcore-v2"

directory = "/content/drive/My Drive"  # where trained models are saved
filename_load = "TD3_{}_{}".format(env_name_old, random_seed)
filename_save = "TD3_{}_{}".format(env_name_new, random_seed)

# Persist actor/critics together with optimizer state and the danger networks.
policy.save(
    directory,
    filename_save,
    optimizers=True,
    danger=True,
)

In [0]:
# One-off fix-up: rename the checkpoint that was written with the misspelled
# suffix "crtic_2" to "critic_2", then list the Drive folder to check the result.
!mv '/content/drive/My Drive/TD3_BipedalWalker-v2_1000000009_solved_crtic_2.pth'  '/content/drive/My Drive/TD3_BipedalWalker-v2_1000000009_solved_critic_2.pth'
!ls  '/content/drive/My Drive/'

## Last