In [2]:
import gym, torch, numpy as np, torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import tianshou as ts
from copy import deepcopy
from tianshou.env import DummyVectorEnv
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.functional as F
from torch.distributions import Independent, Normal
import os
import time
import json
from tqdm import tqdm

from env import OilControlEnv
from common.tools import load_json_config, load_sys_config
from common.utils import *
from common.log_path import make_logpath

from solver.gurobi.solve import solve as gurobi_solver


In [3]:
env_config_dir = "./config"
env_configs = load_config(env_config_dir, 'oil_env')
env_args = get_paras_from_dict(env_configs)
env_all_conf = load_json_config("env/config.json")
env_conf = env_all_conf['Oil_Control']
env_sys_conf = load_sys_config(env_args.config_path, env_args.model_id)
env_run_dir, env_log_dir = make_logpath(env_args.scenario, env_args.algo)
solver_reward_k = 0.002
# 测试用环境
# RL+求解器双层算法用
class OilSupply_Env():
    def __init__(self):
        self.env = OilControlEnv(env_conf, env_sys_conf)
        self.reset()
        self.action_space = self.env.action_space
    
    def reset(self):
        self.step_cnt = 0
        self.state = self.env.reset()
        obs = np.array(self.env.obs2vec(self.state))
        self.obs_space = len(obs)
        return obs
    
    def step(self, vec_action):
        self.step_cnt += 1
        gurobi_action = self.RL_action2dict(vec_action)
        dict_action, solver_obj = gurobi_solver(self.env.vertices, self.env.edges, gurobi_action, self.step_cnt, 0)
        self.state, reward, done, info = self.env.step(dict_action)
        info['action'] = gurobi_action
        obs = np.array(self.env.obs2vec(self.state))
        reward = reward*2 + 15.35
#         reward -= solver_reward_k * solver_obj
#         print(dict_action)
        return obs, reward, done, info
    
    def RL_action2dict(self, action):
        action_dict = {}
        action_dict = {}
        idx = 0
        vec = action.tolist()
        for key in self.state.keys():
            if key == "transfer":
                action_dict['transfer'] = []
                for item in self.state[key]:
                    data = {}
                    data['key'] = item['key']
                    data['storage'] = dict(zip(item['materials'][1:], vec[idx:idx+len(item["storage"])-1]))
                    # data['storage'] = vec[idx:idx+len(item["storage"])]
                    idx += len(item['storage']) - 1
                    action_dict['transfer'].append(data)
            if key == "refinery":
                action_dict['refinery'] = []
                for item in self.state[key]:
                    data = {}
                    data['key'] = item['key']
                    # if isinstance(item["left_JG_budget"], list):
                    #     data['left_JG_budget'] = vec[idx:idx+len(item["left_JG_budget"])]
                    #     idx += len(item["left_JG_budget"])
                    # if isinstance(item["left_JG_budget"], int):
                    #     data['left_JG_budget'] = {'JGHY': vec[idx]}
                    #     idx += 1
                    data['storage'] = {}
                    for j in ['JGHY', "PGLE", "PLDO"]:
                        if j in item["storage"].keys():
                            data['storage'][j] = vec[idx]
                            idx += 1
                    action_dict['refinery'].append(data)
#         print(idx)
        return action_dict

# 纯RL算法用
class OilSupply_Env1():
    def __init__(self):
        self.env = OilControlEnv(env_conf, env_sys_conf)
        self.reset()
        self.action_space = self.env.action_space
    
    def reset(self):
        self.step_cnt = 0
        self.state = self.env.reset()
        obs = np.array(self.env.obs2vec(self.state))
        self.obs_space = len(obs)
        return obs
    
    def step(self, vec_action):
        self.step_cnt += 1
        action = self.env.vec2action(vec_action)
        self.state, reward, done, info = self.env.step(action)
        obs = np.array(self.env.obs2vec(self.state))
        reward = reward*2 + 15.35
#         print(dict_action)
        return obs, reward, done, info



# sample_env = OilSupply_Env()
# obs_space = sample_env.obs_space
# action_space = sample_env.action_space

In [4]:
class mlp_resblock_relu(nn.Module):
    def __init__(self, in_ch, ch, out_ch=None, block_num=3, is_relu=True):
        super().__init__()
        self.models=nn.Sequential()
        self.relus=nn.Sequential()
        self.block_num = block_num
        self.is_in = in_ch
        self.is_out = out_ch
        self.is_relu = is_relu
        
        if self.is_in:
            self.in_mlp = nn.Sequential(*[
                nn.Linear(in_ch, ch), 
                nn.LeakyReLU(0.1, inplace=True)])
        for i in range(self.block_num):
            self.models.add_module(str(i), nn.Sequential(*[
                nn.Linear(ch, ch),
                nn.LeakyReLU(0.1, inplace=True),
                nn.Linear(ch, ch)]))
            self.relus.add_module(str(i), nn.Sequential(*[
                nn.LeakyReLU(0.1, inplace=True)]))
        if self.is_out:
            self.out_mlp = nn.Sequential(*[
            nn.Linear(ch, ch), 
            nn.LeakyReLU(0.1, inplace=True),
            nn.Linear(ch, out_ch)
            ])
        if self.is_relu:
            self.relu = nn.ReLU(inplace=True)
            
    def forward(self, x):
        if self.is_in:
            x = self.in_mlp(x)
        for i in range(self.block_num):
            x0 = x
            x = self.models[i](x)
            x += x0
            x = self.relus[i](x)
        if self.is_out:
            x = self.out_mlp(x)
        if self.is_relu:
            x = self.relu(x)
        return x

MLP_CH = 1024
class Actor(nn.Module):
    def __init__(self, obs_space, action_space, is_gpu=True):
        super().__init__()
        self.is_gpu = is_gpu
        self.net = mlp_resblock_relu(in_ch=obs_space, ch=MLP_CH, out_ch=action_space, block_num=6, is_relu=True)
        self.sigma_param = nn.Parameter(torch.zeros(action_space, 1))

    def load_model(self, filename):
        map_location=lambda storage, loc:storage
        self.load_state_dict(torch.load(filename, map_location=map_location))
        print('load model!')
    
    def save_model(self, filename):
        torch.save(self.state_dict(), filename)
        print('save model!')

    def forward(self, obs, state=None, info={}):
        obs = torch.tensor(obs).float()
        if self.is_gpu:
            obs = obs.cuda()
        
        mu = self.net(obs)
        shape = [1] * len(mu.shape)
        shape[1] = -1
        sigma = (self.sigma_param.view(shape) + torch.zeros_like(mu)).exp()
        return (mu,sigma), state

    
class Critic(nn.Module):
    def __init__(self, obs_space,is_gpu=True):
        super().__init__()

        self.is_gpu = is_gpu
        self.net = mlp_resblock_relu(in_ch=obs_space, ch=MLP_CH, out_ch=1, block_num=6, is_relu=False)

    def load_model(self, filename):
        map_location=lambda storage, loc:storage
        self.load_state_dict(torch.load(filename, map_location=map_location))
        print('load model!')
    
    def save_model(self, filename):
        torch.save(self.state_dict(), filename)
        print('save model!')

    def forward(self, obs, state=None, info={}):
        obs = torch.tensor(obs).float()
        if self.is_gpu:
            obs = obs.cuda()
        v = self.net(obs)

        return v
    

In [10]:
# 测RL+求解器双层方法
env1 = OilSupply_Env()
obs_space = env1.obs_space
action_space = env1.action_space
actor = Actor(is_gpu = False, obs_space=obs_space, action_space=action_space).cpu()
# critic = Critic(is_gpu = is_gpu)

load_path = None
load_path = 'save/ppo/exp2/ep12-actor.pth'

env1 = OilSupply_Env()

import warnings
warnings.filterwarnings("ignore")

for ep in range(20,20+1):
#     starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
    load_path = '../model/ppo/ep%02d-actor.pth'%(ep)
    actor.load_model(load_path)
    actor.eval()
    reward = 0
    total_t = 0
    warn_cnt = 0
    obs = env1.reset()
    t1 = time.clock()
    for st in range(30):
#         starter.record()
        act,_ = actor([obs])
        action = act[0].detach().view(-1).cpu().numpy()
        # 服务器gpu常占用，需空闲时才能进行精确测时
#         torch.cuda.synchronize()
#         ender.record()
#         curr_time = starter.elapsed_time(ender) # 计算时间
#         tim1.append(curr_time)
        
#         starter.record()
        action = (np.clip(action, -1.0, 1.0)+1)/2
#         print(action)
        obs, rew, done, info = env1.step(action)
        torch.cuda.synchronize()
#         ender.record()
#         curr_time = starter.elapsed_time(ender) # 计算时间
#         tim2.append(curr_time)
        
#         total_t += curr_time
#         reward += rew*2+15.35
        reward += rew
        warn_cnt += info['split_rewards'][9]*100
#         times.append(curr_time)
#     times.append(total_t)
# times = np.array(times)
# tim1 = np.array(tim1)
# tim2 = np.array(tim2)
t2 = time.clock()
print('episode:', ep, 'reward:', reward, 'warning:', int(-warn_cnt),'use time:', t2-t1)

RuntimeError: Error(s) in loading state_dict for Actor:
	Unexpected key(s) in state_dict: "net.in_mlp.0.weight", "net.in_mlp.0.bias", "net.out_mlp.0.weight", "net.out_mlp.0.bias", "net.out_mlp.2.weight", "net.out_mlp.2.bias". 
	size mismatch for sigma_param: copying a param with shape torch.Size([109, 1]) from checkpoint, the shape in current model is torch.Size([0, 1]).

In [12]:
# 测纯RL方法
env2 = OilSupply_Env1()
obs_space = env2.obs_space
action_space = 87
actor = Actor(is_gpu = True, obs_space=obs_space, action_space=action_space).cpu()
# critic = Critic(is_gpu = is_gpu)

# load_path = None
# load_path = 'save/ppo2/exp2/ep00-actor.pth'
# actor.load_model(load_path)
# actor.eval()
# print('ok')

env2 = OilSupply_Env1()
for ep in range(10,10+1):
    load_path = '../model/ppo2/ep%02d-actor.pth'%(ep)
    actor.load_model(load_path)
    actor.eval()
    reward = 0
    total_t = 0
    warn_cnt = 0
    obs = env2.reset()
    t1 = time.clock()
    for st in range(30):
        act,_ = actor([obs])
        action = act[0].detach().view(-1).cpu().numpy()

        action = (np.clip(action, -10.0, 10.0)+10)/2

        obs, rew, done, info = env2.step(action)
#         reward += rew*2+15.35
        reward += rew
        warn_cnt += info['split_rewards'][9]*100
t2 = time.clock()
print('episode:', ep, 'reward:', reward, 'warning:', int(-warn_cnt),'use time:', t2-t1)

RuntimeError: Error(s) in loading state_dict for Actor:
	Unexpected key(s) in state_dict: "net.in_mlp.0.weight", "net.in_mlp.0.bias". 