In [None]:
import gym, torch, numpy as np, torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import tianshou as ts
from copy import deepcopy
from tianshou.env import DummyVectorEnv
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.functional as F
from torch.distributions import Independent, Normal
import os
import time
import json
from tqdm import tqdm

from env import OilControlEnv
from common.tools import load_json_config, load_sys_config
from common.utils import *
from common.log_path import make_logpath

from solver.gurobi.solve import solve as gurobi_solver

In [None]:
"""
Pure reinforcement-learning model.
Reinforcement-learning algorithm: PPO.
Algorithm library: thu-tianshou.
"""

In [None]:
# ---- Experiment / optimisation settings ----
config = 'OilSupply'
lr, epoch, batch_size = 1e-5, 10, 1024
train_num, test_num = 128, 128  # number of training / test environments built below
gamma, lr_decay = 0.9, None  # lr_decay=None means no LR scheduler is created
buffer_size = 1000000
# Only referenced by the commented-out prioritized replay buffer.
buffer_alpha, buffer_beta = 0.6, 0.4
# Only referenced by a commented-out policy.set_eps call.
eps_train, eps_test = 0.1, 0.00
# NOTE(review): presumably 30 collects x train_num envs x ~300 steps per episode — confirm.
step_per_epoch, episode_per_collect = 30*train_num*300, train_num
writer = SummaryWriter('log/ppo2')  # tensorboard is also supported!
logger = ts.utils.BasicLogger(writer)
# Use the GPU only when one is actually present, so the notebook also runs on
# CPU-only machines instead of failing at the first .cuda() call.
is_gpu = torch.cuda.is_available()

# ---- PPO hyper-parameters ----
gae_lambda, max_grad_norm = 0.95, 0.5
vf_coef, ent_coef = 0.25, 0.0
# NOTE(review): action_scaling and bound_action_method are defined here but are
# not passed to PPOPolicy in the visible cells, so the policy presumably uses
# the library defaults — confirm which behaviour is intended.
rew_norm, action_scaling = False, False
bound_action_method = "clip"
eps_clip, value_clip = 0.2, False
repeat_per_collect = 2
# Boolean switches are spelled as booleans (previously 0.0 / 0, same truthiness).
dual_clip, norm_adv = None, False
recompute_adv = False

# Not referenced in the visible cells.
solver_reward_k = 0.002

In [None]:
# Load the oil-environment settings and derive the run / log directories.
# NOTE(review): load_config and get_paras_from_dict are not defined in this
# notebook; presumably they come from `from common.utils import *` — confirm.
env_config_dir = "./config"
env_configs = load_config(env_config_dir, 'oil_env')
env_args = get_paras_from_dict(env_configs)
# Only the 'Oil_Control' section of env/config.json is used below.
env_all_conf = load_json_config("env/config.json")
env_conf = env_all_conf['Oil_Control']
env_sys_conf = load_sys_config(env_args.config_path, env_args.model_id)
env_run_dir, env_log_dir = make_logpath(env_args.scenario, env_args.algo)

class OilSupply_Env():
    """Adapter around ``OilControlEnv`` that speaks flat vectors.

    Observations are converted with ``env.obs2vec`` and incoming action
    vectors with ``env.vec2action``. The reward returned by the wrapped
    environment is reshaped affinely as
    ``reward * reward_scale + reward_offset``.

    Args:
        reward_scale: multiplier applied to the raw reward. ``None`` keeps
            the class default (2).
        reward_offset: constant added after scaling. ``None`` keeps the
            class default (15.35).
    """

    # Class-level defaults for the reward shaping; an instance may override them.
    reward_scale = 2
    reward_offset = 15.35

    def __init__(self, reward_scale=None, reward_offset=None):
        if reward_scale is not None:
            self.reward_scale = reward_scale
        if reward_offset is not None:
            self.reward_offset = reward_offset
        self.env = OilControlEnv(env_conf, env_sys_conf)
        # Reset once so that obs_space is known right after construction.
        self.reset()
        self.action_space = self.env.action_space

    def reset(self):
        """Reset the wrapped environment and return the observation vector.

        Also zeroes ``step_cnt`` and records the vector length in ``obs_space``.
        """
        self.step_cnt = 0
        self.state = self.env.reset()
        obs = np.array(self.env.obs2vec(self.state))
        self.obs_space = len(obs)
        return obs

    def step(self, vec_action):
        """Advance the wrapped environment by one step.

        Args:
            vec_action: action vector; converted with ``env.vec2action``
                before being handed to the wrapped environment.

        Returns:
            ``(obs, reward, done, info)`` where ``obs`` is the vectorised new
            state and ``reward`` is the shaped reward.
        """
        self.step_cnt += 1
        action = self.env.vec2action(vec_action)
        self.state, reward, done, info = self.env.step(action)
        obs = np.array(self.env.obs2vec(self.state))
        reward = reward * self.reward_scale + self.reward_offset
        return obs, reward, done, info


# Instance created to read the observation-vector length.
sample_env = OilSupply_Env()
obs_space = sample_env.obs_space
# Flat action dimension consumed by Actor; hard-coded here.
# NOTE(review): presumably must match the vector length env.vec2action expects — confirm against OilControlEnv.
action_space = 87

In [None]:
class mlp_resblock_relu(nn.Module):
    """Residual MLP trunk.

    Layout: optional input projection (``in_ch -> ch``), ``block_num``
    residual blocks of width ``ch``, optional output head (``ch -> out_ch``)
    and an optional final ReLU.

    Args:
        in_ch: input width; a falsy value skips the input projection.
        ch: hidden width of every residual block.
        out_ch: output width; a falsy value skips the output head.
        block_num: number of residual blocks.
        is_relu: whether a ReLU is applied to the final result.
    """

    def __init__(self, in_ch, ch, out_ch=None, block_num=3, is_relu=True):
        super().__init__()
        # Attribute names are kept as-is: they determine the state_dict keys.
        self.models = nn.Sequential()
        self.relus = nn.Sequential()
        self.block_num = block_num
        self.is_in = in_ch
        self.is_out = out_ch
        self.is_relu = is_relu

        if self.is_in:
            self.in_mlp = nn.Sequential(
                nn.Linear(in_ch, ch),
                nn.LeakyReLU(0.1, inplace=True),
            )

        for idx in range(self.block_num):
            key = str(idx)
            # Residual branch: Linear -> LeakyReLU -> Linear.
            branch = nn.Sequential(
                nn.Linear(ch, ch),
                nn.LeakyReLU(0.1, inplace=True),
                nn.Linear(ch, ch),
            )
            self.models.add_module(key, branch)
            # Activation applied after the skip connection is added.
            self.relus.add_module(key, nn.Sequential(nn.LeakyReLU(0.1, inplace=True)))

        if self.is_out:
            self.out_mlp = nn.Sequential(
                nn.Linear(ch, ch),
                nn.LeakyReLU(0.1, inplace=True),
                nn.Linear(ch, out_ch),
            )

        if self.is_relu:
            self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Run ``x`` through the projection, residual blocks, head and final ReLU."""
        if self.is_in:
            x = self.in_mlp(x)

        for branch, activation in zip(self.models, self.relus):
            skip = x
            x = branch(x)
            x += skip
            x = activation(x)

        if self.is_out:
            x = self.out_mlp(x)
        if self.is_relu:
            x = self.relu(x)
        return x

# Hidden width passed as `ch` to the Actor and Critic MLP trunks.
MLP_CH = 1024
class Actor(nn.Module):
    """Gaussian policy head: maps observations to ``(mu, sigma)``.

    ``mu`` comes from a residual MLP whose final ReLU keeps it non-negative.
    ``sigma`` is ``exp(sigma_param)`` broadcast to the shape of ``mu``; it is
    a learned parameter that does not depend on the observation.

    Args:
        is_gpu: stored on the instance for backward compatibility. ``forward``
            places inputs on whatever device the module's parameters live on,
            so input placement follows ``.cuda()`` / ``.to(...)`` on the module.
    """

    def __init__(self, is_gpu=True):
        super().__init__()
        self.is_gpu = is_gpu
        self.net = mlp_resblock_relu(in_ch=obs_space, ch=MLP_CH, out_ch=action_space, block_num=6, is_relu=True)
        # Log-sigma, one entry per action dimension; zeros give sigma = 1 initially.
        self.sigma_param = nn.Parameter(torch.zeros(action_space, 1))

    def load_model(self, filename):
        """Load a state dict from ``filename`` (tensors are mapped to CPU storage first)."""
        map_location=lambda storage, loc:storage
        self.load_state_dict(torch.load(filename, map_location=map_location))
        print('load model!')

    def save_model(self, filename):
        """Write this module's state dict to ``filename``."""
        torch.save(self.state_dict(), filename)
        print('save model!')

    def forward(self, obs, state=None, info={}):
        """Compute the Gaussian parameters for a batch of observations.

        Args:
            obs: batched observations (array-like or tensor); converted to a
                float32 tensor on the module's device.
            state: passed through unchanged.
            info: unused.

        Returns:
            ``((mu, sigma), state)``.
        """
        # Follow the weights' device rather than a separate flag, and avoid the
        # extra copy and warning that torch.tensor() causes on tensor inputs.
        device = next(self.parameters()).device
        obs = torch.as_tensor(obs, dtype=torch.float32, device=device)

        mu = self.net(obs)
        # View sigma_param as (1, action_dim, 1, ...) so it broadcasts over the batch.
        shape = [1] * len(mu.shape)
        shape[1] = -1
        sigma = (self.sigma_param.view(shape) + torch.zeros_like(mu)).exp()
        return (mu,sigma), state

    
class Critic(nn.Module):
    """State-value network: maps observations to a single value per sample.

    Args:
        is_gpu: stored on the instance for backward compatibility. ``forward``
            places inputs on whatever device the module's parameters live on,
            so input placement follows ``.cuda()`` / ``.to(...)`` on the module.
    """

    def __init__(self, is_gpu=True):
        super().__init__()

        self.is_gpu = is_gpu
        # No final ReLU: the value estimate may be negative.
        self.net = mlp_resblock_relu(in_ch=obs_space, ch=MLP_CH, out_ch=1, block_num=6, is_relu=False)

    def load_model(self, filename):
        """Load a state dict from ``filename`` (tensors are mapped to CPU storage first)."""
        map_location=lambda storage, loc:storage
        self.load_state_dict(torch.load(filename, map_location=map_location))
        print('load model!')

    def save_model(self, filename):
        """Write this module's state dict to ``filename``."""
        torch.save(self.state_dict(), filename)
        print('save model!')

    def forward(self, obs, state=None, info={}):
        """Return the value estimate for a batch of observations.

        Args:
            obs: batched observations (array-like or tensor); converted to a
                float32 tensor on the module's device.
            state: unused.
            info: unused.
        """
        # Follow the weights' device rather than a separate flag, and avoid the
        # extra copy and warning that torch.tensor() causes on tensor inputs.
        device = next(self.parameters()).device
        obs = torch.as_tensor(obs, dtype=torch.float32, device=device)
        v = self.net(obs)

        return v
    

In [None]:
from tianshou.utils.net.common import ActorCritic

# Build both networks (actor first, then critic).
actor = Actor(is_gpu=is_gpu)
critic = Critic(is_gpu=is_gpu)

# Optional warm start from saved checkpoints; disabled while load_path is None.
load_path = None
# load_path = 'save/ppo/exp1/ep03-actor.pth'
# actor.load_model(load_path)
# load_path = 'save/ppo/exp1/ep03-critic.pth'
# critic.load_model(load_path)

if is_gpu:
    # Module.cuda() moves the module in place and returns it.
    actor, critic = actor.cuda(), critic.cuda()

# Wrap both networks so one optimizer can be built over all their parameters.
actor_critic = ActorCritic(actor, critic)

# if load_path is None:
#     # orthogonal initialization
#     for m in actor_critic.modules():
#         if isinstance(m, torch.nn.Linear):
#             torch.nn.init.orthogonal_(m.weight)
#             torch.nn.init.zeros_(m.bias)
#         if isinstance(m, torch.nn.Conv2d):
#             torch.nn.init.orthogonal_(m.weight)
#             torch.nn.init.zeros_(m.bias)

optim = torch.optim.Adam(params=actor_critic.parameters(), lr=lr)

In [None]:
def dist(*logits):
    """Build the action distribution from the actor's ``(mu, sigma)`` output.

    The arguments are forwarded to ``Normal``; the last batch dimension is
    then reinterpreted as the event dimension, so ``log_prob`` sums over it.
    """
    per_dim = Normal(*logits)
    return Independent(per_dim, 1)


# `action_space` starts out as the flat action dimension (an int) and is rebound
# here to a Box with bounds 0.0..10.0 per dimension. The guard keeps this cell
# re-runnable: wrapping an already-built Box a second time would raise.
# NOTE(review): Actor.__init__ reads the global `action_space` as an int, so
# Actor must still be constructed before this cell has run.
if not isinstance(action_space, gym.spaces.Box):
    action_space = gym.spaces.Box(0.0,10.0,(action_space,))

if lr_decay:
    # The lambda argument is named `n` so it does not shadow the global `epoch`.
    # NOTE(review): the exponent is n-1, so the factor at n=0 is 1/lr_decay —
    # confirm this is intended rather than lr_decay**n.
    lr_scheduler = LambdaLR(
        optim, lr_lambda=lambda n: lr_decay**(n-1)
    )
else:
    lr_scheduler = None

# NOTE(review): the config cell defines action_scaling and bound_action_method,
# but they are not passed here, so the policy presumably falls back to the
# library defaults — confirm which behaviour is intended.
policy = ts.policy.PPOPolicy(actor, critic, optim, dist,
        discount_factor=gamma, max_grad_norm=max_grad_norm,
        eps_clip=eps_clip, vf_coef=vf_coef,
        ent_coef=ent_coef, reward_normalization=rew_norm,
        advantage_normalization=norm_adv, recompute_advantage=recompute_adv,
        dual_clip=dual_clip, value_clip=value_clip,
        gae_lambda=gae_lambda, action_space=action_space,
        lr_scheduler=lr_scheduler,
    )

In [None]:

# you can also try with SubprocVectorEnv
# Each slot gets its own OilSupply_Env; the class itself serves as the zero-argument factory.
train_envs = DummyVectorEnv([OilSupply_Env for _ in range(train_num)])
test_envs = DummyVectorEnv([OilSupply_Env for _ in range(test_num)])

# Alternative (disabled): prioritized replay.
# buffer = ts.data.PrioritizedVectorReplayBuffer(buffer_size, train_num, alpha=buffer_alpha, beta=buffer_beta)
buffer = ts.data.VectorReplayBuffer(buffer_size, train_num)

# The training collector stores transitions in `buffer`; the test collector is given no buffer.
train_collector = ts.data.Collector(policy, train_envs, buffer)
test_collector = ts.data.Collector(policy, test_envs)

# Run one episode through the training collector before the trainer starts.
train_collector.collect(n_episode=1)
# # a,b = train_collector.collect(n_episode=1)
# # a,b = train_collector.collect(n_episode=1)

def save_best_fn(policy):
    """Hook passed to the trainer as ``save_best_fn``; currently does nothing.

    The disabled lines below show how the actor/critic could be saved here.
    """
    # policy.actor.save_model('save/ppo/exp1/best-actor.pth')
    # policy.critic.save_model('save/ppo/exp1/best-critic.pth')
    return None

def test_fn(epoch, env_step):
    policy.actor.save_model('save/ppo2/exp2/ep%02d-actor.pth'%(epoch))
    policy.critic.save_model('save/ppo2/exp2/ep%02d-critic.pth'%(epoch))
#     pass

def train_fn(epoch, env_step):
    """No-op training hook; both arguments are ignored."""
    # policy.set_eps(eps_train)
    return None

def reward_metric(rews):
    """Identity reward metric: hand the rewards back untouched."""
    unchanged = rews
    return unchanged

# Launch on-policy training. train_fn and reward_metric defined above are not
# passed to the trainer here.
result = ts.trainer.onpolicy_trainer(
    policy,
    train_collector,
    test_collector,
    epoch,
    step_per_epoch,
    repeat_per_collect,
    test_num,
    batch_size,
    episode_per_collect=episode_per_collect,
    save_best_fn=save_best_fn,
    logger=logger,
    test_fn=test_fn,
    test_in_train=False,
)
