# Twin-Delayed DDPG

## Installing the packages

In [1]:
# !pip install pybullet

## Importing the libraries

TD3 packages

In [1]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
# import pybullet_envs
# import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
# from gym import wrappers
from torch.autograd import Variable
from collections import deque

Others

In [2]:
# import community
import numpy as np
import networkx as nx
import matplotlib as mpl
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
import graphviz
from networkx.drawing.nx_agraph import write_dot, graphviz_layout
import random
import pydoc

from decimal import Decimal
import json
import matplotlib.patches as mpatches

import sys
sys.path.append("..")
from src.utils import ColorPrint as _
# from src.ds import McmcTree as Tree
from datasets.synthetic.new_generator import TreeGenerator

In [3]:
# from new_ds import McmcTree

In [4]:
from tree_for_td3 import TreeForTD3

In [11]:
class TD3(object):
    """Twin-Delayed DDPG agent (abbreviated copy of the class).

    Holds an actor/critic pair, their target copies and one Adam optimizer
    for each online network. Relies on the module-level names ``Actor``,
    ``Critic`` and ``device``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        actor_args = (state_dim, action_dim, max_action)
        critic_args = (state_dim, action_dim)

        # Networks are created in the order actor, actor target, critic,
        # critic target.
        self.actor = Actor(*actor_args).to(device)
        self.actor_target = Actor(*actor_args).to(device)
        self.critic = Critic(*critic_args).to(device)
        self.critic_target = Critic(*critic_args).to(device)

        # Targets start out as exact copies of the online networks.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state):
        """Return the actor's action for one state as a flat NumPy array.

        ``state`` is reshaped to a single row before being fed to the actor.
        """
        row = torch.Tensor(state.reshape(1, -1)).to(device)
        action = self.actor(row)
        return action.cpu().data.numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` sampling passes over ``replay_buffer``.

        NOTE(review): this copy stops after step 6 (the smoothed target
        action is computed and then dropped); no network is updated here and
        ``discount``, ``tau`` and ``policy_freq`` are unused.
        """

        def on_device(batch):
            return torch.Tensor(batch).to(device)

        for it in range(iterations):

            # Step 4: sample a batch of transitions (s, s', a, r, done) from the memory
            sampled = replay_buffer.sample(batch_size)
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = sampled
            state = on_device(batch_states)
            next_state = on_device(batch_next_states)
            action = on_device(batch_actions)
            reward = on_device(batch_rewards)
            done = on_device(batch_dones)

            # Step 5: the actor target proposes the next action a' from s'
            next_action = self.actor_target(next_state)

            # Step 6: add clipped Gaussian noise to a' and clamp the result
            # to [-max_action, max_action]
            noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
            noise = torch.clamp(noise, -noise_clip, noise_clip)
            next_action = torch.clamp(next_action + noise, -self.max_action, self.max_action)


### ActionSpace

In [12]:
class ActionSpace(object):
    """Action-space descriptor exposing ``low``, ``high``, ``shape`` and ``sample``.

    NOTE(review): ``shape`` advertises ``2*M`` entries while ``sample``
    returns ``2*(M-1)`` values; confirm which length the consumer expects.
    """

    def __init__(self, M=10, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        self.__M = M
        self.low = [0]
        self.high = [1]
        self.shape = [2*M]

    def sample(self):
        """Return a random action: two normalized weight vectors, concatenated.

        The first ``M-1`` entries are the prune weights, the next ``M-1`` the
        reattach weights; each half is drawn uniformly and scaled to sum to 1.
        """
        n_weights = self.__M - 1
        halves = []
        for _half in ("prune", "reattach"):
            weights = np.random.rand(n_weights,)
            halves.append(weights / np.sum(weights))
        return np.concatenate(halves)

### ObservationSpace

In [13]:
class ObservationSpace(object):
    """Observation-space descriptor: an ``N`` by ``M`` shape with empty bounds."""

    def __init__(self, M=10, N=15, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        self.__M, self.__N = M, N
        # No bounds are declared for observations.
        self.high, self.low = [], []
        # Shape is stored as [N, M].
        self.shape = [N, M]


### Environment

In [14]:
class Environment():
    """Gym-style environment around a ``TreeForTD3`` executor.

    On construction a synthetic data set is generated with ``TreeGenerator``,
    missing entries of the noisy matrix are imputed, and a ``TreeForTD3``
    object is built from it and stored as ``self.tree_obj``.
    """

    def __init__(self,
                 M=10,
                 N=15,
                 ZETA=1,
                 Gamma=0.15,
                 alpha=0.002,
                 beta=0.03,
                 MR=0.02,
                 save_dir="../tmp"
                ):
        """Build the spaces, generate the data and create the tree executor.

        ``M``, ``N``, ``ZETA``, ``Gamma``, ``alpha``, ``beta``, ``MR`` and
        ``save_dir`` are forwarded to ``TreeGenerator``; ``alpha``, ``beta``
        and ``save_dir`` are also forwarded to ``TreeForTD3``.
        """
        # define default variables (the spaces now follow the requested sizes)
        self.observation_space = ObservationSpace(M=M, N=N)
        self.action_space = ActionSpace(M)
        self._max_episode_steps = 1000

        # determine parameters
        self.M = M
        self.N = N
        self.ZETA = ZETA
        self.Gamma = Gamma
        self.alpha = alpha
        self.beta = beta
        self.MR = MR
        self.save_dir = save_dir

        # setup tree generator
        generator = TreeGenerator(
            M,
            N,
            ZETA=ZETA,
            Gamma=Gamma,
            alpha=alpha,
            beta=beta,
            MR=MR,
            save_dir=save_dir,
        )

        # generate tree object
        tree_obj = generator.generate()

        # retrieve required variables from the tree object (matrices are transposed)
        gt_E, gt_D, D, CNP, gt_T = tree_obj.get_mcmc_tree_data()
        gt_E, gt_D, D = gt_E.T, gt_D.T, D.T

        # some operations
        gensNames = [str(i) for i in range(M)]
        print("GenesNames:\n\t"+'\n\t'.join(gensNames))
        C_num = D.shape[1]
        G_num = D.shape[0]
        _.print_warn(f'There is {C_num} cells and {G_num} mutations at {len(gensNames)} genes in this dataset.')

        # filling missed data (entries equal to 3.)
        self._fill_missing(D)

        # construct the main tree executor as the most important variable in the environment object
        dl = list(D)
        root = [n for n, d in gt_T.in_degree() if d == 0][0]
        print('ROOT:', root)
        T = TreeForTD3(
            gensNames,
            D,
            data_list=dl,
            root=str(root),
            alpha=alpha, beta=beta,
            save_dir=save_dir,
        )

        # setting ground truth variables
        T.set_ground_truth(gt_D, gt_E, gt_T=gt_T)

        # storing the main tree object
        self.tree_obj = T

    @staticmethod
    def _fill_missing(D, missing=3.):
        """Impute, in place, every entry of ``D`` that equals ``missing``.

        A missing cell in column ``c`` becomes 1. with probability
        ones / (ones + zeros), counted over the current content of column
        ``c``, and 0. otherwise. Cells are visited row by row, so values
        filled earlier count towards later draws in the same column. A column
        with no observed 0/1 entry falls back to probability 0.5.

        Returns ``D`` for convenience.
        """
        n_rows, n_cols = D.shape
        for m in range(n_rows):
            for c in range(n_cols):
                if D[m, c] != missing:
                    continue
                column = D[:, c]
                # Count matching entries; len(np.where(...)) would only give
                # the length of the index tuple (always 1 for a 1-D column).
                ones = np.count_nonzero(column == 1.)
                zeros = np.count_nonzero(column == 0.)
                observed = ones + zeros
                p_one = ones / observed if observed else 0.5
                D[m, c] = 1. if np.random.rand() < p_one else 0.
        return D

    def __prepare_state(self, X):
        """Turn a raw tree observation into the state handed to the agent.

        NOTE(review): not implemented yet, so it returns None and both
        ``reset`` and ``step`` currently hand None back as the observation.
        """
        pass

    def reset(self):
        """Randomize the tree and return the prepared state of its ``E`` data."""
        self.tree_obj.randomize()
        E = self.tree_obj.get_E()
        return self.__prepare_state(E)

    def step(self, action):
        """Apply ``action`` to the tree; return ``(state, reward, done, info)``."""
        print(f"step in environment: {action}")
        new_obs, reward, done, info = self.tree_obj.do_step(action)
        return self.__prepare_state(new_obs), reward, done, info

    def action_sample(self):
        """Return a random action drawn from ``self.action_space``."""
        return self.action_space.sample()

    def seed(self, seed):
        """Accepted for API compatibility; currently does nothing."""
        pass

In [15]:
env_name = "AntBulletEnv-v0"  # Environment label (originally the name of a continuous-control gym environment)
seed = 0  # Random seed number
# Step counts are whole numbers, so they are stored as ints rather than floats.
start_timesteps = 10_000  # Number of timesteps during which actions are sampled at random, after which the policy network is used
eval_freq = 5_000  # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 500_000  # Total number of iterations/timesteps
save_models = True  # Whether or not to save the trained model
expl_noise = 0.1  # Exploration noise - STD of the Gaussian noise added to the actions played in the environment
batch_size = 100  # Size of the batch
discount = 0.99  # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005  # Target network update rate
policy_noise = 0.2  # STD of the Gaussian noise added to the target action a' during training
noise_clip = 0.5  # Maximum absolute value of that Gaussian noise
policy_freq = 2  # Number of iterations to wait before the policy network (Actor model) is updated

In [16]:
M = 10
N = 15

# Pass the sizes explicitly so the environment is built from the values above
# instead of silently relying on matching defaults.
env = Environment(M=M, N=N)
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
# TD3.select_action flattens each observation to one row, so the network
# input size is the product of all observation dimensions, not just the first.
state_dim = int(np.prod(env.observation_space.shape))
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
policy = TD3(state_dim, action_dim, max_action)

# Smoke test: apply one random action directly to the tree executor.
action = env.action_space.sample()
env.tree_obj.do_step(action)

GenesNames:
	0
	1
	2
	3
	4
	5
	6
	7
	8
	9
ROOT: 2


[1;33mThere is 15 cells and 10 mutations at 10 genes in this dataset.[0m


NameError: name 'Actor' is not defined

In [9]:
# Draft of the interaction loop. It expects `obs`, `episode_timesteps`,
# `episode_reward`, `timesteps_since_eval` and `replay_buffer` to exist already.
total_timesteps = 0

# We start the main loop over max_timesteps timesteps
while total_timesteps < max_timesteps:

    print(f"total_timesteps:{total_timesteps}")
    # Before start_timesteps timesteps, we play random actions
    if total_timesteps < start_timesteps:
        action = env.action_space.sample()
    else: # Afterwards, we switch to the model
        action = policy.select_action(np.array(obs))
        # If the explore_noise parameter is not 0, we add noise to the action and we clip it
        if expl_noise != 0:
            action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

    print(f"Action:{action}")
    # The agent performs the action in the environment, then reaches the next state and receives the reward.
    # The fourth value is bound to `info`, not `_`: at module level `_` is the
    # ColorPrint alias and must not be overwritten.
    new_obs, reward, done, info = env.step(action)

    # We check if the episode is done (reaching the step limit is not stored as a terminal transition)
    done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)

    # We increase the total reward
    episode_reward += reward

    # We store the new transition into the Experience Replay memory (ReplayBuffer)
    replay_buffer.add((obs, new_obs, action, reward, done_bool))

    # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
    obs = new_obs
    episode_timesteps += 1
    total_timesteps += 1
    timesteps_since_eval += 1

NameError: name 'max_timesteps' is not defined

## Global Variables

In [None]:
# Synthetic data-set parameters; all of them are forwarded to TreeGenerator below.
M = 10
N = 15
ZETA = 1
Gamma = 0.15
alpha = 0.002
beta = 0.03
MR = 0.02

tg = TreeGenerator(
    M = M,
    N = N,
    ZETA = ZETA,
    Gamma = Gamma,
    alpha = alpha,
    beta = beta,
    MR = MR,
)
# NOTE(review): here generate() is unpacked as a 4-tuple, whereas the
# Environment class in this file calls get_mcmc_tree_data() on its result and
# unpacks five values - confirm which API version is current.
(gt_E, gt_D, D, gt_T) = tg.generate()
# Gene names are the string indices "0" .. str(M-1).
gensNames = list( str(i) for i in range(M) )
print(gensNames)



# Columns of D are counted as cells, rows as mutations.
C_num = D.shape[1]
G_num = D.shape[0]
_.print_warn( 'There is {} cells and {} mutations at {} genes in this dataset.'.format(C_num, G_num, len(gensNames)) )



### fill missed data
def tf(m,c):
    """Draw a replacement value (1. or 0.) for a missing entry in column ``c``.

    Reads the module-level matrix ``D``. Returns 1. with probability
    ones / (ones + zeros), counted over the current content of column ``c``.
    If the column holds no observed 0/1 entry the probability falls back to
    0.5. ``m`` (the row index) is accepted but not used.
    """
    column = D[:, c]
    # Count matching entries; len(np.where(...)) would only give the length
    # of the index tuple (always 1 for a 1-D column).
    ones = float(np.count_nonzero(column == 1.))
    zeros = float(np.count_nonzero(column == 0.))
    observed = ones + zeros
    p_one = ones / observed if observed else 0.5
    return 1. if np.random.rand() < p_one else 0.

# Visit every cell row by row and replace each entry equal to 3. by a value
# drawn with tf().
cells = ((m, c) for m in range(G_num) for c in range(C_num))
for m, c in cells:
    if D[m,c] != 3.:
        continue
    D[m,c] = tf(m,c)


### Run
# One entry per row of D.
dl = list(d for d in D)
# The root is the first node of the ground-truth tree without incoming edges.
root = [n for n,d in gt_T.in_degree() if d==0][0]
print('ROOT:', root)
# NOTE(review): `Tree` is not imported in the visible part of this file (the
# `McmcTree as Tree` import is commented out) - confirm before running this cell.
T = Tree(gensNames, D, data_list=dl, root=str(root), alpha=alpha, beta=beta)
T.set_ground_truth(gt_D, gt_E, gt_T=gt_T)

In [None]:
# Start from a randomized tree and plot it next to the ground truth.
T.randomize()
T.plot_gt_T()
T.plot_best_T('initial T')


T.set_rho(10)

# Run at most 100 iterations, stopping early as soon as T.next() returns a truthy value.
for i in range(100):
    if T.next():
        break


T.plot_all_results()

In [None]:
# shape[0] -> dimension
# Plain scalars: a trailing comma after each value would turn it into a
# 1-tuple (e.g. ``M = 10,`` is ``(10,)``) and break expressions such as 2*M.
M = 10
N = 15
ZETA = 1
Gamma = 0.15
alpha = 0.002
beta = 0.03
MR = 0.02
        
        
class GYM(object):
    """Small factory that mimics the ``gym.make`` entry point."""

    def __init__(self):
        pass

    def make_env(
        self, name,
        M = 10,
        N = 15,
        ZETA = 1,
        Gamma = 0.15,
        alpha = 0.002,
        beta = 0.03,
        MR = 0.02,
    ):
        """Create an ``Environment`` from the given settings.

        ``name`` is accepted but not used; all other arguments are passed to
        ``Environment`` as keyword arguments.
        """
        settings = dict(M=M, N=N, ZETA=ZETA, Gamma=Gamma, alpha=alpha, beta=beta, MR=MR)
        return Environment(**settings)


class ActionSpace(object):
    """Draft action-space descriptor; its size comes from the module-level ``M``."""

    def __init__(self):
        self.low = [0]
        self.high = [1]
        # One entry per prune weight and per reattach weight slot: 2*M in total.
        self.shape = [2*M]

    def sample(self):
        """Placeholder: no sampling is implemented here, so None is returned."""
        return None
    

class ObservationSpace(object):
    """Draft observation-space descriptor with empty bounds and an empty shape."""

    def __init__(self):
        # Three distinct empty lists: nothing is declared yet.
        self.high, self.low, self.shape = [], [], []
    
    
class Environment():
    """Draft gym-style environment around a ``TreeForTD3`` executor.

    The data-set settings are constructor arguments (they were written in the
    class header before, which is not valid for passing instance settings).
    """

    def __init__(self, M=10, N=15, ZETA=1, Gamma=0.15, alpha=0.002, beta=0.03, MR=0.02):
        """Generate a synthetic data set and build the tree executor from it.

        All arguments are forwarded to ``TreeGenerator``; ``alpha`` and
        ``beta`` are also forwarded to ``TreeForTD3``.
        """
        # define default variables
        self.observation_space = ObservationSpace()
        self.action_space = ActionSpace()
        self._max_episode_steps = 1000

        # determine parameters
        self.M = M
        self.N = N
        self.ZETA = ZETA
        self.Gamma = Gamma
        self.alpha = alpha
        self.beta = beta
        self.MR = MR
        save_dir = "../tmp"

        # setup tree generator
        generator = TreeGenerator(
            M,
            N,
            ZETA=ZETA,
            Gamma=Gamma,
            alpha=alpha,
            beta=beta,
            MR=MR,
            save_dir=save_dir,
        )

        # generate tree object
        tree_obj = generator.generate()

        # retrieve required variables from the tree object (matrices are transposed)
        gt_E, gt_D, D, CNP, gt_T = tree_obj.get_mcmc_tree_data()
        gt_E, gt_D, D = gt_E.T, gt_D.T, D.T

        # some operations
        gensNames = [str(i) for i in range(M)]
        print("GenesNames:\n\t"+'\n\t'.join(gensNames))
        C_num = D.shape[1]
        G_num = D.shape[0]
        _.print_warn( 'There is {} cells and {} mutations at {} genes in this dataset.'.format(C_num, G_num, len(gensNames)) )

        # filling missed data (entries equal to 3.)
        self._fill_missing(D)

        # construct the main tree executor as the most important variable in the environment object
        dl = list(D)
        root = [n for n, d in gt_T.in_degree() if d == 0][0]
        print('ROOT:', root)
        # TreeForTD3 is the class imported at the top of the file.
        T = TreeForTD3(
            gensNames,
            D,
            data_list=dl,
            root=str(root),
            alpha=alpha, beta=beta,
            save_dir="../tmp"
        )

        # setting ground truth variables
        T.set_ground_truth(gt_D, gt_E, gt_T=gt_T)

        # storing the main tree object
        self.tree_obj = T

    @staticmethod
    def _fill_missing(D, missing=3.):
        """Impute, in place, every entry of ``D`` that equals ``missing``.

        A missing cell in column ``c`` becomes 1. with probability
        ones / (ones + zeros), counted over the current content of column
        ``c``, and 0. otherwise. Cells are visited row by row. A column with
        no observed 0/1 entry falls back to probability 0.5.

        Returns ``D`` for convenience.
        """
        n_rows, n_cols = D.shape
        for m in range(n_rows):
            for c in range(n_cols):
                if D[m, c] != missing:
                    continue
                column = D[:, c]
                # Count matching entries; len(np.where(...)) would only give
                # the length of the index tuple (always 1 for a 1-D column).
                ones = np.count_nonzero(column == 1.)
                zeros = np.count_nonzero(column == 0.)
                observed = ones + zeros
                p_one = ones / observed if observed else 0.5
                D[m, c] = 1. if np.random.rand() < p_one else 0.
        return D

    def __prepare_state(self, X):
        """Turn a raw tree observation into the state handed to the agent.

        NOTE(review): the draft had no body here; it is kept as a stub that
        returns None until the state encoding is decided.
        """
        pass

    def reset(self):
        """Randomize the tree and return the prepared state of its ``E`` data."""
        self.tree_obj.randomize()
        E = self.tree_obj.get_E()
        return self.__prepare_state(E)

    def __choose_nodes(self, Pr, At):
        ''' choose a node to prune and a node to reattach'''
        pr_node = np.random.choice(a=range(len(Pr)), size=1, p=Pr)
        # NOTE(review): the draft stopped here. It called an undefined
        # `sub_tree(T, pr_node)` and left `at_node` without a value, so the
        # reattach choice still has to be written.
        raise NotImplementedError(
            "reattach-node selection is not implemented (pruned node: %r)" % (pr_node,)
        )

    def step(self, action):
        """Apply ``action`` to the tree; return ``(state, reward, done, info)``.

        NOTE(review): this draft calls ``tree_obj.step`` while other cells in
        this file call ``tree_obj.do_step`` - confirm the method name.
        """
        new_obs, reward, done, info = self.tree_obj.step(action)
        return self.__prepare_state(new_obs), reward, done, info

    def action_sample(self):
        """Placeholder; returns None."""
        pass

    def seed(self, seed):
        """Accepted for API compatibility; currently does nothing."""
        pass

## Step 1: We initialize the Experience Replay memory

In [8]:
class ReplayBuffer(object):
    """Fixed-capacity experience replay memory.

    Transitions are appended until ``max_size`` is reached; after that the
    slot at ``ptr`` is overwritten and ``ptr`` advances cyclically.
    """

    def __init__(self, max_size=1e6):
        self.storage = []
        # Stored as an int so that ``ptr`` stays a valid list index.
        self.max_size = int(max_size)
        self.ptr = 0

    def add(self, transition):
        """Store one transition ``(obs, new_obs, action, reward, done_bool)``."""
        if len(self.storage) >= self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly with replacement.

        Returns five arrays: states, next states, actions, rewards and dones;
        rewards and dones are reshaped to column vectors ``(batch_size, 1)``.
        Raises ``ValueError`` (from ``np.random.randint``) if the buffer is empty.
        """
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
        for i in ind:
            state, next_state, action, reward, done = self.storage[i]
            # np.asarray copies only when it has to; np.array(..., copy=False)
            # raises on NumPy 2 whenever a copy cannot be avoided (e.g. scalars).
            batch_states.append(np.asarray(state))
            batch_next_states.append(np.asarray(next_state))
            batch_actions.append(np.asarray(action))
            batch_rewards.append(np.asarray(reward))
            batch_dones.append(np.asarray(done))
        return (
            np.array(batch_states),
            np.array(batch_next_states),
            np.array(batch_actions),
            np.array(batch_rewards).reshape(-1, 1),
            np.array(batch_dones).reshape(-1, 1),
        )

## Step 2: We build one neural network for the Actor model and one neural network for the Actor target

In [18]:
class Actor(nn.Module):
    """Policy network: state -> action in ``[-max_action, max_action]``.

    Three linear layers (state_dim -> 400 -> 300 -> action_dim) with ReLU
    after the first two and a scaled tanh on the output.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        widths = (state_dim, 400, 300, action_dim)
        self.layer_1 = nn.Linear(widths[0], widths[1])
        self.layer_2 = nn.Linear(widths[1], widths[2])
        self.layer_3 = nn.Linear(widths[2], widths[3])
        self.max_action = max_action

    def forward(self, x):
        """Return the action for a batch of states ``x``."""
        hidden = x
        for layer in (self.layer_1, self.layer_2):
            hidden = torch.relu(layer(hidden))
        # tanh bounds the output to [-1, 1]; max_action rescales it.
        return self.max_action * torch.tanh(self.layer_3(hidden))

## Step 3: We build two neural networks for the two Critic models and two neural networks for the two Critic targets

In [19]:
class Critic(nn.Module):
    """Twin Q-networks that score a (state, action) pair.

    Both networks share the same layout (state_dim + action_dim -> 400 ->
    300 -> 1); ``layer_1..3`` belong to the first, ``layer_4..6`` to the second.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        joint_dim = state_dim + action_dim
        # First Critic network
        self.layer_1 = nn.Linear(joint_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)
        # Second Critic network
        self.layer_4 = nn.Linear(joint_dim, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

    @staticmethod
    def _run_head(xu, first, second, last):
        """Push ``xu`` through one three-layer network (ReLU after the first two)."""
        hidden = torch.relu(first(xu))
        hidden = torch.relu(second(hidden))
        return last(hidden)

    def forward(self, x, u):
        """Return the two Q-value estimates for states ``x`` and actions ``u``."""
        xu = torch.cat((x, u), dim=1)
        q1 = self._run_head(xu, self.layer_1, self.layer_2, self.layer_3)
        q2 = self._run_head(xu, self.layer_4, self.layer_5, self.layer_6)
        return q1, q2

    def Q1(self, x, u):
        """Return only the first network's Q-value estimate."""
        xu = torch.cat((x, u), dim=1)
        return self._run_head(xu, self.layer_1, self.layer_2, self.layer_3)

## Steps 4 to 15: Training Process

In [20]:
# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole Training Process into a class

class TD3(object):
    """Twin-Delayed DDPG agent: actor, twin critic, their targets and optimizers."""

    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        # Targets start as exact copies of the online networks.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state):
        """Return the actor's action for one state as a flat NumPy array.

        ``state`` is reshaped to a single row before being fed to the actor.
        """
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` TD3 updates on batches sampled from ``replay_buffer``.

        The critic is updated every iteration; the actor and both target
        networks are updated every ``policy_freq`` iterations.
        """

        for it in range(iterations):

            # Step 4: We sample a batch of transitions (s, s', a, r, done) from the memory
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_states).to(device)
            next_state = torch.Tensor(batch_next_states).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # The target is a constant for the critic update, so it is built
            # without tracking gradients through the target networks.
            with torch.no_grad():
                # Step 5: From the next state s', the Actor target plays the next action a'
                next_action = self.actor_target(next_state)

                # Step 6: We add clipped Gaussian noise to this next action a' and clamp it to [-max_action, max_action]
                noise = torch.Tensor(batch_actions).normal_(0, policy_noise).to(device)
                noise = noise.clamp(-noise_clip, noise_clip)
                next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

                # Step 7: The two Critic targets take the couple (s', a') and return Qt1(s',a') and Qt2(s',a')
                target_Q1, target_Q2 = self.critic_target(next_state, next_action)

                # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
                target_Q = torch.min(target_Q1, target_Q2)

                # Step 9: Final target: Qt = r + gamma * min(Qt1, Qt2), with the bootstrap term dropped when done
                target_Q = reward + (1 - done) * discount * target_Q

            # Step 10: The two Critic models take the couple (s, a) and return Q1(s,a) and Q2(s,a)
            current_Q1, current_Q2 = self.critic(state, action)

            # Step 11: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Step 12: We backpropagate this Critic loss and update the two Critic models with the Adam optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 13: Once every policy_freq iterations, we update the Actor by gradient ascent on the first Critic's output
            if it % policy_freq == 0:
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Step 14: Same schedule: update the Actor target by polyak averaging
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

                # Step 15: Same schedule: update the Critic target by polyak averaging
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    # Making a save method to save a trained model
    def save(self, filename, directory):
        """Write the actor and critic weights to ``directory`` as ``<filename>_actor.pth`` / ``<filename>_critic.pth``."""
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    # Making a load method to load a pre-trained model
    def load(self, filename, directory):
        """Load actor and critic weights saved by ``save``.

        Weights are mapped onto the current ``device``. The target networks
        are not stored on disk, so they are reset to copies of the loaded
        networks; otherwise they would keep their random initial weights.
        """
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename), map_location=device))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename), map_location=device))
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

## We make a function that evaluates the policy by calculating its average reward over 10 episodes

In [21]:
def evaluate_policy(policy, eval_episodes=10):
    """Play ``eval_episodes`` episodes with ``policy`` and return the mean episode reward.

    Uses the module-level ``env``. Each episode runs until the environment
    reports ``done``; the average is printed between two separator lines.
    """
    separator = "---------------------------------------"
    avg_reward = 0.
    for _ in range(eval_episodes):
        obs = env.reset()
        while True:
            chosen = policy.select_action(np.array(obs))
            obs, reward, done, _info = env.step(chosen)
            avg_reward += reward
            if done:
                break
    avg_reward /= eval_episodes
    print (separator)
    print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
    print (separator)
    return avg_reward

## We set the parameters

In [22]:
env_name = "AntBulletEnv-v0"  # Environment label (originally the name of a continuous-control gym environment)
seed = 0  # Random seed number
# Step counts are whole numbers, so they are stored as ints rather than floats.
start_timesteps = 10_000  # Number of timesteps during which actions are sampled at random, after which the policy network is used
eval_freq = 5_000  # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 500_000  # Total number of iterations/timesteps
save_models = True  # Whether or not to save the trained model
expl_noise = 0.1  # Exploration noise - STD of the Gaussian noise added to the actions played in the environment
batch_size = 100  # Size of the batch
discount = 0.99  # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005  # Target network update rate
policy_noise = 0.2  # STD of the Gaussian noise added to the target action a' during training
noise_clip = 0.5  # Maximum absolute value of that Gaussian noise
policy_freq = 2  # Number of iterations to wait before the policy network (Actor model) is updated

## We create a file name for the two saved models: the Actor and Critic models

In [23]:
# Base name shared by the saved model files and the results file:
# algorithm tag, environment label and seed joined by underscores.
file_name = "_".join(("TD3", str(env_name), str(seed)))
banner = "---------------------------------------"
print (banner)
print ("Settings: %s" % (file_name))
print (banner)

---------------------------------------
Settings: TD3_AntBulletEnv-v0_0
---------------------------------------


## We create a folder inside which will be saved the trained models

In [24]:
# exist_ok=True makes the call a no-op when the directory is already there,
# without the check-then-create race of a separate os.path.exists() test.
os.makedirs("./results", exist_ok=True)
if save_models:
    os.makedirs("./pytorch_models", exist_ok=True)

## We create the environment (the custom tree `Environment`, used here in place of the PyBullet one)

In [25]:
# The gym environment is disabled; the custom Environment class is used with
# its default arguments instead.
# env = gym.make(env_name)
env = Environment()

GenesNames:
	0
	1
	2
	3
	4
	5
	6
	7
	8
	9
ROOT: 5


[1;33mThere is 15 cells and 10 mutations at 10 genes in this dataset.[0m


## We set seeds and we get the necessary information on the states and actions in the chosen environment

In [26]:
# Seed the environment, PyTorch and NumPy with the same value.
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
# TD3.select_action flattens each observation to one row, so the network
# input size is the product of all observation dimensions, not just the first.
state_dim = int(np.prod(env.observation_space.shape))
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

## We create the policy network (the Actor model)

In [27]:
# The TD3 agent builds the actor, the critic and their target copies.
policy = TD3(state_dim, action_dim, max_action)

## We create the Experience Replay memory

In [28]:
# Replay memory with the default capacity.
replay_buffer = ReplayBuffer()

## We define a list where all the evaluation results over 10 episodes are stored

In [29]:
# The list starts with one evaluation of the untrained policy.
evaluations = [evaluate_policy(policy)]

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

## We create a new folder directory in which the final results (videos of the agent) will be populated

In [None]:
def mkdir(base, name):
    """Create directory ``name`` under ``base`` if needed and return its path.

    Missing parent directories are created as well; an existing directory is
    left untouched.
    """
    path = os.path.join(base, name)
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(path, exist_ok=True)
    return path
# Output folders: exp/brs and exp/brs/monitor.
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')
max_episode_steps = env._max_episode_steps
# Video recording is switched off.
# NOTE(review): `wrappers` is not imported in the visible part of this file
# (the gym imports are commented out), so enabling this flag needs that import.
save_env_vid = False
if save_env_vid:
    env = wrappers.Monitor(env, monitor_dir, force = True)
    env.reset()

## We initialize the variables

In [None]:
# Counters for the training loop.
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
# Starting with done=True makes the loop reset the environment on its first pass.
done = True
# Wall-clock start time.
t0 = time.time()

## Training

In [None]:
# We start the main loop over max_timesteps timesteps
while total_timesteps < max_timesteps:

    # If the episode is done
    if done:

        # If we are not at the very beginning, we start the training process of the model
        if total_timesteps != 0:
            print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
            policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

        # We evaluate the episode and we save the policy
        if timesteps_since_eval >= eval_freq:
            timesteps_since_eval %= eval_freq
            evaluations.append(evaluate_policy(policy))
            # Honour the save_models switch here too, as the final save below does.
            if save_models:
                policy.save(file_name, directory="./pytorch_models")
            np.save("./results/%s" % (file_name), evaluations)

        # When the training step is done, we reset the state of the environment
        obs = env.reset()

        # Set the Done to False
        done = False

        # Set rewards and episode timesteps to zero
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    # Before start_timesteps timesteps, we play random actions
    if total_timesteps < start_timesteps:
        action = env.action_space.sample()
    else: # Afterwards, we switch to the model
        action = policy.select_action(np.array(obs))
        # If the explore_noise parameter is not 0, we add noise to the action and we clip it
        if expl_noise != 0:
            action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

    # The agent performs the action in the environment, then reaches the next state and receives the reward.
    # The fourth value is bound to `info`, not `_`: at module level `_` is the
    # ColorPrint alias and must not be overwritten.
    new_obs, reward, done, info = env.step(action)

    # We check if the episode is done (reaching the step limit is not stored as a terminal transition)
    done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)

    # We increase the total reward
    episode_reward += reward

    # We store the new transition into the Experience Replay memory (ReplayBuffer)
    replay_buffer.add((obs, new_obs, action, reward, done_bool))

    # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
    obs = new_obs
    episode_timesteps += 1
    total_timesteps += 1
    timesteps_since_eval += 1

# We add the last policy evaluation to our list of evaluations and we save our model
evaluations.append(evaluate_policy(policy))
if save_models: policy.save("%s" % (file_name), directory="./pytorch_models")
np.save("./results/%s" % (file_name), evaluations)

## Inference

In [None]:
class Actor(nn.Module):
    """Policy network used at inference time.

    Layout: state_dim -> 400 -> 300 -> action_dim, ReLU after the first two
    layers, tanh scaled by ``max_action`` on the output.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        hidden_a, hidden_b = 400, 300
        self.layer_1 = nn.Linear(state_dim, hidden_a)
        self.layer_2 = nn.Linear(hidden_a, hidden_b)
        self.layer_3 = nn.Linear(hidden_b, action_dim)
        self.max_action = max_action

    def forward(self, x):
        """Map a batch of states to actions in ``[-max_action, max_action]``."""
        first = torch.relu(self.layer_1(x))
        second = torch.relu(self.layer_2(first))
        squashed = torch.tanh(self.layer_3(second))
        return self.max_action * squashed

class Critic(nn.Module):
    """Twin Q-networks used at inference time.

    ``layer_1..3`` form the first network and ``layer_4..6`` the second; both
    map the concatenated (state, action) vector to a single value.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        in_features = state_dim + action_dim
        # First Critic network
        self.layer_1 = nn.Linear(in_features, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)
        # Second Critic network
        self.layer_4 = nn.Linear(in_features, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

    def _first_q(self, xu):
        """Evaluate the first network on the concatenated input ``xu``."""
        return self.layer_3(torch.relu(self.layer_2(torch.relu(self.layer_1(xu)))))

    def _second_q(self, xu):
        """Evaluate the second network on the concatenated input ``xu``."""
        return self.layer_6(torch.relu(self.layer_5(torch.relu(self.layer_4(xu)))))

    def forward(self, x, u):
        """Return both Q-value estimates for states ``x`` and actions ``u``."""
        joined = torch.cat((x, u), dim=1)
        return self._first_q(joined), self._second_q(joined)

    def Q1(self, x, u):
        """Return only the first network's Q-value estimate."""
        return self._first_q(torch.cat((x, u), dim=1))

# Selecting the device: CUDA when torch reports it as available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole Training Process into a class

class TD3(object):
    """Twin-Delayed DDPG agent: an actor, a twin critic and a target copy of each.

    Relies on the module-level ``Actor``, ``Critic`` and ``device`` definitions.
    """

    def __init__(self, state_dim, action_dim, max_action):
        """Build the online and target networks; each target starts as a copy of its online network.

        Both optimizers are Adam with PyTorch's default hyper-parameters.
        """
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state):
        """Return the actor's action for a single ``state`` as a flat NumPy array.

        ``state`` is reshaped to a single row before being fed to the actor.
        """
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    @staticmethod
    def _soft_update(source, target, tau):
        """Polyak-average the parameters of ``source`` into ``target`` in place."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` TD3 update steps on batches drawn from ``replay_buffer``.

        The critic is updated on every iteration. The actor and both target
        networks are updated only on iterations where ``it % policy_freq == 0``.

        Args:
            replay_buffer: object whose ``sample(batch_size)`` returns
                ``(states, next_states, actions, rewards, dones)``.
            iterations: number of update steps; ``0`` performs no update.
            batch_size: number of transitions sampled per step.
            discount: discount factor applied to the target Q-value.
            tau: interpolation factor of the target-network polyak averaging.
            policy_noise: standard deviation of the noise added to the target action.
            noise_clip: bound applied to that noise before it is added.
            policy_freq: period (in iterations) of the actor / target updates.
        """
        for it in range(iterations):

            # Step 4: We sample a batch of transitions (s, s', a, r, done) from the memory
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_states).to(device)
            next_state = torch.Tensor(batch_next_states).to(device)
            action = torch.Tensor(batch_actions).to(device)
            # NOTE(review): rewards and dones are assumed to be shaped (batch, 1) so that they
            # combine element-wise with the critic outputs - confirm against the replay buffer.
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # Step 5: From the next state s', the Actor target plays the next action a'
            next_action = self.actor_target(next_state)

            # Step 6: We add clipped Gaussian noise to a' and clamp the result to [-max_action, max_action]
            noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

            # Step 7: The two Critic targets evaluate (s', a') and return Qt1(s', a') and Qt2(s', a')
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)

            # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
            target_Q = torch.min(target_Q1, target_Q2)

            # Step 9: Final target Qt = r + gamma * (1 - done) * min(Qt1, Qt2); detached so no gradient reaches the targets
            target_Q = reward + ((1 - done) * discount * target_Q).detach()

            # Step 10: The two Critic models evaluate (s, a) and return Q1(s, a) and Q2(s, a)
            current_Q1, current_Q2 = self.critic(state, action)

            # Step 11: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Step 12: We backpropagate the Critic loss and take one optimizer step on the Critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 13: Once every policy_freq iterations, we update the Actor by gradient ascent on Q1(s, actor(s)).
            # This must live inside the loop: placed after it, it would run at most once per call
            # and would fail on the undefined loop variable when iterations == 0.
            if it % policy_freq == 0:
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Step 14: Still once every policy_freq iterations, we update the Critic target by polyak averaging
                self._soft_update(self.critic, self.critic_target, tau)

                # Step 15: Still once every policy_freq iterations, we update the Actor target by polyak averaging
                self._soft_update(self.actor, self.actor_target, tau)

    # Making a save method to save a trained model
    def save(self, filename, directory):
        """Write the actor and critic state dicts to ``<directory>/<filename>_actor.pth`` and ``..._critic.pth``."""
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    # Making a load method to load a pre-trained model
    def load(self, filename, directory):
        """Load the actor and critic state dicts written by ``save`` and re-sync the target networks."""
        # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine (and vice versa)
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename), map_location=device))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename), map_location=device))
        # Targets are not part of the checkpoint; copy the loaded weights as __init__ does,
        # otherwise they would keep their random initial values.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

def evaluate_policy(policy, eval_episodes=10):
    """Play ``eval_episodes`` episodes in the global ``env`` and return the average episode reward.

    Every step's action comes from ``policy.select_action``. The average is
    also printed between two separator lines.
    """
    separator = "---------------------------------------"
    score = 0.
    for _episode in range(eval_episodes):
        observation = env.reset()
        while True:
            chosen_action = policy.select_action(np.array(observation))
            observation, step_reward, finished, _info = env.step(chosen_action)
            score += step_reward
            if finished:
                break
    score /= eval_episodes
    print (separator)
    print ("Average Reward over the Evaluation Step: %f" % (score))
    print (separator)
    return score

# Inference settings: environment, seed and the checkpoint name derived from them
env_name = "AntBulletEnv-v0"
seed = 0

file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

eval_episodes = 10
save_env_vid = True
# NOTE(review): gym, wrappers and monitor_dir are not defined in this cell, and the gym imports
# near the top of the notebook are commented out - confirm they are in scope before running.
env = gym.make(env_name)
max_episode_steps = env._max_episode_steps
if save_env_vid:
    # Wrap the environment so that episodes are recorded into monitor_dir
    env = wrappers.Monitor(env, monitor_dir, force = True)
    env.reset()
# Seed the environment, PyTorch and NumPy with the same value
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
# Network sizes and action bound are read from the environment's spaces
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
policy = TD3(state_dim, action_dim, max_action)
policy.load(file_name, './pytorch_models/')
# Bind the result to a descriptive name rather than `_`: at module level `_` is the
# ColorPrint alias imported from src.utils, and assigning to it would overwrite that alias.
inference_avg_reward = evaluate_policy(policy, eval_episodes=eval_episodes)