In [1]:
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
import os
from torch.distributions.multivariate_normal import MultivariateNormal
from utils import *
from cem import CEM

In [2]:
class Actor(nn.Module):
    """Gaussian policy over a 2-D action, parameterised by a small MLP.

    The MLP maps a 2-D state to two values in (0, 1) (sigmoid output). Those
    values are multiplied by ``action_scale`` (10 per dimension) to form the
    mean of a multivariate normal whose standard deviation is the fixed,
    non-learned ``stds`` (0.1 per dimension).
    """

    def __init__(self):
        super(Actor, self).__init__()
        # Constant tensors are registered as non-persistent buffers: they then
        # follow the module through .to()/.cuda(), while state_dict() keeps
        # exactly the six weight/bias entries of ``self.actor``.
        self.register_buffer("stds", torch.Tensor([0.1, 0.1]), persistent=False)
        self.register_buffer("action_scale", torch.Tensor([10, 10]), persistent=False)
        self.actor = nn.Sequential(
            nn.Linear(2, 8),
            nn.Tanh(),
            nn.Linear(8, 8),
            nn.Tanh(),
            nn.Linear(8, 2),
            nn.Sigmoid()
            )

    def forward(self, state):
        """Return the action distribution for ``state``.

        Args:
            state: tensor whose last dimension is 2; it is cast to float32
                before being fed to the network.

        Returns:
            A ``MultivariateNormal`` with mean ``10 * actor(state)`` and a
            diagonal scale of ``stds``.
        """
        mu = self.actor(state.float())
        return MultivariateNormal(
            loc=mu * self.action_scale,
            scale_tril=torch.diag(self.stds),
        )

In [3]:
# The Actor network has 114 parameters in total (16 + 8 + 64 + 8 + 16 + 2)
def get_para_array(model):
    """Flatten the actor's weights and biases into one 1-D float64 array.

    The entries are laid out in this order: layer-0 weight, layer-0 bias,
    layer-2 weight, layer-2 bias, layer-4 weight, layer-4 bias (each tensor
    flattened row-major). The length is derived from the tensors themselves,
    so it is 114 for a 2-8-8-2 network and adapts to other layer sizes.

    Args:
        model: module whose ``state_dict()`` contains the keys
            ``actor.{0,2,4}.{weight,bias}``.

    Returns:
        ``numpy.ndarray`` of dtype float64 holding a copy of all parameters.
    """
    layer_keys = (
        'actor.0.weight', 'actor.0.bias',
        'actor.2.weight', 'actor.2.bias',
        'actor.4.weight', 'actor.4.bias',
    )
    # Build the state dict once instead of once per tensor.
    state = model.state_dict()
    # .cpu() lets this work for models that live on an accelerator.
    pieces = [state[key].detach().cpu().numpy().reshape(-1) for key in layer_keys]
    return np.concatenate(pieces).astype(np.float64)

def update_para_tensor(model,para_array):
    """Write a flat parameter vector back into the actor's layers, in place.

    The vector is consumed in this order: layer-0 weight, layer-0 bias,
    layer-2 weight, layer-2 bias, layer-4 weight, layer-4 bias. Each chunk is
    reshaped to the shape of the tensor it overwrites, so the layout follows
    the model's actual layer sizes (114 values for a 2-8-8-2 network).

    Args:
        model: module whose ``state_dict()`` contains the keys
            ``actor.{0,2,4}.{weight,bias}``.
        para_array: array-like with at least as many values as the model has
            parameters. Trailing extra values are ignored.

    Raises:
        ValueError: if ``para_array`` holds fewer values than the model needs.
    """
    layer_keys = (
        'actor.0.weight', 'actor.0.bias',
        'actor.2.weight', 'actor.2.bias',
        'actor.4.weight', 'actor.4.bias',
    )
    # state_dict() tensors share storage with the live parameters, so copying
    # into them updates the model itself.
    state = model.state_dict()
    flat = np.asarray(para_array).reshape(-1)

    needed = sum(state[key].numel() for key in layer_keys)
    if flat.size < needed:
        raise ValueError(
            "para_array has %d values but the model needs %d" % (flat.size, needed)
        )

    offset = 0
    for key in layer_keys:
        target = state[key]
        count = target.numel()
        chunk = torch.from_numpy(np.ascontiguousarray(flat[offset:offset + count]))
        # copy_ converts dtype (e.g. float64 -> float32) and device as needed.
        target.copy_(chunk.reshape(target.shape))
        offset += count
    

In [10]:
# Instantiate two separate Actor networks.
actor0, actor1 = Actor(), Actor()

In [11]:
# Flatten each actor's parameters into a vector, then print both vectors.
para0, para1 = (get_para_array(net) for net in (actor0, actor1))
print(para0, para1, sep="\n")

[ 0.69590276 -0.29910487  0.6939314   0.14993623 -0.48842365  0.06651034
 -0.25525758  0.52788478  0.67436624  0.00531489 -0.62435496  0.5336386
  0.50414371  0.38237357  0.20454632 -0.28448683 -0.5315513  -0.46473855
 -0.52503413  0.49420872 -0.02796359 -0.51185477  0.65425658 -0.36188632
  0.0021881  -0.01603738  0.2688995  -0.07050021 -0.33359504  0.18825345
  0.23515703  0.25892466 -0.07574366 -0.3348363  -0.25958183  0.00742737
  0.16798094 -0.09658397  0.32031482  0.01029218  0.09861287 -0.15829438
 -0.2317508  -0.25902158  0.21892649  0.15190984  0.1562417   0.06453201
  0.24486859  0.12200531 -0.01069527  0.05246237  0.27347794 -0.16135693
  0.30199456 -0.28770864 -0.10271777 -0.17433012  0.32842553  0.07329766
 -0.14459299 -0.2833955   0.33494499 -0.2034532  -0.05309664  0.17539242
  0.18350038  0.01621613 -0.34562245 -0.35040268  0.28225622  0.22798675
 -0.34094059  0.08362162 -0.24378617 -0.13010632  0.09725988 -0.32171291
 -0.05726914 -0.15881257 -0.33190069 -0.09666451  0.

In [None]:
# Module-level network that nn_score overwrites with each candidate vector.
policy = Actor()


def nn_score(para):
    """Score one flat parameter vector by the mean reward of the policy.

    ``para`` is loaded into the module-level ``policy``; the raw network
    ``policy.actor`` is evaluated at every row of ``limits`` and each
    (limit, via point) pair is scored with ``get_reward``.

    Args:
        para: flat parameter vector accepted by ``update_para_tensor``.

    Returns:
        The mean of the 20 per-point rewards.
    """
    # NOTE(review): all 20 evaluation points are the zero vector, so every
    # point yields the same network output - looks like a placeholder; confirm.
    limits = np.zeros([20, 2])
    update_para_tensor(policy, para)

    # nn.Linear only accepts tensors, so convert the NumPy points first.
    # Gradients are not needed: the rewards are only read back as numbers.
    # NOTE(review): this calls policy.actor directly, so the x10 scaling
    # applied in Actor.forward is bypassed - confirm that is intended.
    with torch.no_grad():
        via_points = policy.actor(torch.from_numpy(limits).float())

    score = np.zeros(len(limits))
    for i in range(len(limits)):
        score[i] = get_reward(limits[i], via_points[i])

    return score.mean()
        
        
    
    
    
    

In [None]:
# The reward that CEM evaluates is how good the NN is.
