In [1]:
import parl
from paddle import fluid
from parl.core.fluid.policy_distribution import CategoricalDistribution

In [2]:
class AtariModel(parl.Model):
    """
    only use for atari
    
    
    """
    
    def __init__(self,actdim):
        
        self.conv1 = parl.layers.conv2d(num_filters=32,filter_size=8,stride=4,padding=1,act='relu')
        self.conv2 = parl.layers.conv2d(num_filters=64,filter_size=4,stride=2,padding=2,act='relu')
        self.conv3 = parl.layers.conv2d(num_filters=64,filter_size=3,stride=1,padding=0,act='relu')
        self.fc = parl.layers.fc(size=512,act='relu')
        
        self.policy_fc = parl.layers.fc(size=actdim,act=None)
        self.value_fc  = parl.layers.fc(size=1,act=None)
        
    def policy(self,obs):
        obs = obs / 255.0
        
        conv1 = self.conv1(obs)
        conv2 = self.conv2(conv1)
        conv3 = self.conv3(conv2)
        
        flatten = fluid.layers.flatten(conv3,axis=1)
        
        fc_output = self.fc(flatten)
        
        policy_logits = self.policy_fc(fc_output)
        return policy_logits
    def value(self,obs):
        obs = obs / 255.0
        
        conv1 = self.conv1(obs)
        conv2 = self.conv2(conv1)
        conv3 = self.conv3(conv2)
        
        flatten = fluid.layers.flatten(conv3,axis=1)
        
        fc_output = self.fc(flatten)
        
        values = self.value_fc(fc_output)
        values = fluid.layers.squeeze(values,axes=[1])
        return values
    def policy_and_value(self,obs):
        
        """
        alg sample use it .
        
        INPUT : [ B,OBSERVATION_SPACE]
        OUTPUT: [BATCHSIZE ACTDIM], [BATCHSIZE]
        """
        obs = obs / 255.0
        
        conv1 = self.conv1(obs)
        conv2 = self.conv2(conv1)
        conv3 = self.conv3(conv2)
        
        flatten = fluid.layers.flatten(conv3,axis=1)
        
        fc_output = self.fc(flatten)
        
        policy_logits = self.policy_fc(fc_output)
        values = self.value_fc(fc_output)
        # squeeze ..
        values = fluid.layers.squeeze(values,axes=[1])
        return policy_logits,values

In [18]:
class a2c(parl.Algorithm):
    def __init__(self,model,config):
        
        self.model = model
        self.vf_coeff = config['vf_coeff']
    def value(self,obs):
        value = self.model.value(obs)
        return value
    def predict(self,obs):
        logits= self.model.policy(obs)
        probs = fluid.layers.softmax(logits,axis=1)
        predict_acts = fluid.layers.argmax(probs,axis=1)
        return predict_acts
        
        
    def sample(self,obs):
        logits ,value= self.model.policy_and_value(obs)
        probs = fluid.layers.softmax(logits,axis=1)
        sample_acts = fluid.layers.sampling_id(probs)
        return sample_acts,value
    def learn(self,obs,act,adv,vtag,lr,ent_coeff):
        logits =self.model.policy(obs)
        
        policy_distributions = CategoricalDistribution(logits)
        
        action_log_probs = policy_distributions.logp(act)
        
        pi_loss = -1.0  * fluid.layers.reduce_sum(action_log_probs * adv)
        
        values = self.model.value(obs)
        
        delta = values - vtag
        vloss = 0.5*fluid.layers.reduce_sum(fluid.layers.square(delta))
        
        policy_entropy = policy_distributions.entropy()
        
        entropy = fluid.layers.reduce_sum(policy_entropy)
        
        total_loss = (pi_loss + vloss * self.vf_coeff + ent_coeff * entropy )
        
        fluid.clip.set_gradient_clip(
            clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))

        optimizer = fluid.optimizer.AdamOptimizer(lr)
        optimizer.minimize(total_loss)
        
        return total_loss,pi_loss,vloss,entropy
    
        
    

In [24]:
from parl.utils.scheduler import PiecewiseScheduler,LinearDecayScheduler

class Agent(parl.Agent):
    
    def __init__(self,alg,config):
        self.obs_shape = config['obs_shape']
        
        super(Agent, self).__init__(alg)

        self.lr_scheduler = LinearDecayScheduler(config['start_lr'],
                                                 config['max_sample_steps'])

        self.entropy_coeff_scheduler = PiecewiseScheduler(
            config['entropy_coeff_scheduler'])
    def sample(self,obs):
        
        feed = 
        
        
    def build_program(self):
        
        self.sample_program = fluid.Program()
        self.predict_program = fluid.Program()
        self.value_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.sample_program):
            obs = fluid.layers.data('obs',self.obs_shape,'float32')
            self.sample_actions, self.sample_values = self.algorithm.sample(obs)
            
        with fluid.program_guard(self.predict_program):
            obs = fluid.layers.data('obs',self.obs_shape,'float32')
            self.predict_acts = self.algorithm.predict(obs)

        with fluid.program_guard(self.value_program):
            obs = fluid.layers.data('obs',self.obs_shape,'float32')
            self.values = self.algorithm.value(obs)

        with fluid.program_guard(self.learn_program):
            obs = fluid.layers.data('obs',self.obs_shape,'float32')
            act = fluid.layers.data('act',[],'int64')
            adv = fluid.layers.data('adv',[],'float32')
            vtag = fluid.layers.data('vtag',[],'float32')
            
            lr = fluid.layers.data(
                name='lr', shape=[1], dtype='float32', append_batch_size=False)
            entropy_coeff = fluid.layers.data(
                name='entropy_coeff', shape=[], dtype='float32')

            total_loss, pi_loss, vf_loss, entropy = self.algorithm.learn(
                obs, act, adv, vtag, lr, entropy_coeff)
            self.learn_outputs = [total_loss, pi_loss, vf_loss, entropy]

SyntaxError: invalid syntax (<ipython-input-24-5614b25ca2d8>, line 17)

In [19]:
#test
config ={}
config['vf_coeff'] = 0.1
config['start_lr'] = 1e-3
config['max_sample_steps'] = 100
config['entropy_coeff_scheduler'] =([0 , -0.01])

obs = fluid.layers.data('obs',[4,80,80],'float32')
act = fluid.layers.data('act',[],'int64')
adv = fluid.layers.data('adv',[],'float32')
vtag= fluid.layers.data('vtag',[],'float32')

In [20]:
model = AtariModel(6)

In [21]:
alg = a2c(model,config)

In [22]:
alg.learn(obs,act,adv,vtag,0.1,0.1)

(name: "tmp_61"
 type {
   type: LOD_TENSOR
   lod_tensor {
     tensor {
       data_type: FP32
       dims: 1
     }
     lod_level: 0
   }
 }, name: "tmp_44"
 type {
   type: LOD_TENSOR
   lod_tensor {
     tensor {
       data_type: FP32
       dims: 1
     }
     lod_level: 0
   }
 }, name: "tmp_49"
 type {
   type: LOD_TENSOR
   lod_tensor {
     tensor {
       data_type: FP32
       dims: 1
     }
     lod_level: 0
   }
 }, name: "reduce_sum_25.tmp_0"
 type {
   type: LOD_TENSOR
   lod_tensor {
     tensor {
       data_type: FP32
       dims: 1
     }
   }
 }
 persistable: false)