In [45]:
import random
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import collections
from chargenv import Env


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        transitions = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*transitions)
        return state, action, reward, next_state, done

    def size(self):
        return len(self.buffer)


class PolicyNet(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim, action_bound):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
        self.action_bound = action_bound

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x)) * self.action_bound


class QValueNet(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(QValueNet, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)




class TD3:
    ''' TD3算法 '''

    def __init__(self, state_dim, hidden_dim, action_dim, action_bound, sigma, actor_lr, critic_lr, tau, gamma, device, policy_noise=0.2, noise_clip=0.1, policy_delay=2):
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device)
        self.critic1 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        self.critic2 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        
        # 目标网络初始化并设置和策略相同的参数
        self.target_actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device)
        self.target_critic1 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        self.target_critic2 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        
        self.target_critic1.load_state_dict(self.critic1.state_dict())
        self.target_critic2.load_state_dict(self.critic2.state_dict())
        self.target_actor.load_state_dict(self.actor.state_dict())

        # 优化器
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr, weight_decay=0.01)
        self.critic_optimizer_1 = torch.optim.Adam(self.critic1.parameters(), lr=critic_lr, weight_decay=0.01)
        self.critic_optimizer_2 = torch.optim.Adam(self.critic2.parameters(), lr=critic_lr, weight_decay=0.01)

        self.gamma = gamma
        self.sigma = sigma  # 高斯噪声的标准差,均值直接设为0
        self.tau = tau  # 目标网络软更新参数
        self.action_dim = action_dim
        self.device = device

        # TD3的参数
        self.policy_noise = policy_noise  # 给目标动作添加噪声的标准差
        self.noise_clip = noise_clip  # 噪声剪裁
        self.policy_delay = policy_delay  # 策略网络的延迟更新步数
        self.action_bound = 0.5
    

    def decay_sigma(self):
        self.sigma *= 0.98

    def take_action(self, state):
        state = state.to(self.device)
        action = self.actor(state).cpu()
        # 给动作添加噪声，增加探索
        action = action.detach() + self.sigma * np.random.randn(self.action_dim)
        return action

    def soft_update(self, net, target_net):
        for param_target, param in zip(target_net.parameters(), net.parameters()):
            param_target.data.copy_(param_target.data * (1.0 - self.tau) + param.data * self.tau)

    def update(self, transition_dict, update_actor=True):
        states = torch.cat(transition_dict['states'], dim=0).to(torch.float).to(self.device)
        actions = torch.cat(transition_dict['actions'], dim=0).to(torch.float).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.cat(transition_dict['next_states'], dim=0).to(torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1, 1).to(self.device)

        # 使用目标网络计算下一个状态的Q值
        next_actions = self.target_actor(next_states)

        noise = torch.clamp(torch.randn_like(next_actions) * self.policy_noise, -self.noise_clip, self.noise_clip)
        next_actions = torch.clamp(next_actions + noise, -self.action_bound, self.action_bound)

        next_q_values_1 = self.target_critic1(next_states, next_actions)
        next_q_values_2 = self.target_critic2(next_states, next_actions)
        next_q_values = torch.min(next_q_values_1, next_q_values_2)

        q_targets = rewards + self.gamma * next_q_values * (1 - dones)
        
        # Critic loss
        critic_loss_1 = torch.mean(F.mse_loss(self.critic1(states, actions), q_targets))
        critic_loss_2 = torch.mean(F.mse_loss(self.critic2(states, actions), q_targets))

        self.critic_optimizer_1.zero_grad()
        critic_loss_1.backward(retain_graph=True)
        self.critic_optimizer_1.step()

        self.critic_optimizer_2.zero_grad()
        critic_loss_2.backward()
        self.critic_optimizer_2.step()

        # 只有每隔policy_delay步才更新策略网络
        # if update_actor:
        actor_loss = -torch.mean(self.critic1(states, self.actor(states)))
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # 软更新目标网络
        self.soft_update(self.actor, self.target_actor)
        self.soft_update(self.critic1, self.target_critic1)
        self.soft_update(self.critic2, self.target_critic2)

        return critic_loss_1, critic_loss_2, actor_loss if update_actor else None


def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, action_bound, device):
    return_list = []
    max_reward = 0
    for i_episode in range(num_episodes):
        episode_return = 0
        state = env.reset()
        done = False

        while not done:
            action = agent.take_action(state).to(device)
            next_state, reward, done = env.step(action)
            replay_buffer.add(state, action, reward, next_state, done)
            state = next_state
            episode_return += reward

            if replay_buffer.size() > minimal_size:
                b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                transition_dict = {'states': b_s, 'actions': b_a, 'next_states': b_ns, 'rewards': b_r, 'dones': b_d}
                agent.update(transition_dict)

        return_list.append(episode_return)
        if max_reward < episode_return:
            torch.save(agent.actor.state_dict(), './model/train/actor_td3{}.pth'.format(arrival_rate))
            torch.save(agent.critic1.state_dict(), './model/train/critic1_td3{}.pth'.format(arrival_rate))
            torch.save(agent.critic2.state_dict(), './model/train/critic2_td3{}.pth'.format(arrival_rate))
            max_reward = episode_return

        if (i_episode + 1) % 10 == 0:
            print('episodes: %d, reward: %f' % (i_episode + 1, torch.mean(torch.tensor(return_list[-10:],dtype=float))))

    return return_list


actor_lr = 0.001
critic_lr = 0.001
num_episodes = 1000
hidden_dim = 128
gamma = 0.99
tau = 0.005
buffer_size = 40000
minimal_size = 5000
batch_size = 128
policy_noise = 0.2
noise_clip = 0.5
policy_delay = 2
sigma = 0.1
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

arrival_rate = 1
data_path = './datasets/EVCD{}.csv'.format(arrival_rate)
env = Env(1, 0, 0, arrival_rate, data_path)

replay_buffer = ReplayBuffer(buffer_size)
state_dim = 4
action_dim = 1
action_bound = 0.5

agent = TD3(state_dim, hidden_dim, action_dim, action_bound, sigma, actor_lr, critic_lr, tau, gamma, device)
return_list = train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, action_bound, device)



with open('./result/td3_{}.csv'.format(arrival_rate), 'w', encoding='utf-8') as file:
    for item in return_list:
        file.write(f"{item}\n")


episodes: 10, reward: 193.188203
episodes: 20, reward: 192.271500
episodes: 30, reward: 192.271500
episodes: 40, reward: 192.271500
episodes: 50, reward: 192.271500
episodes: 60, reward: 192.271500
episodes: 70, reward: 192.271500
episodes: 80, reward: 192.271500
episodes: 90, reward: 192.271500
episodes: 100, reward: 192.271500
episodes: 110, reward: 192.271500
episodes: 120, reward: 192.271500
episodes: 130, reward: 192.271500
episodes: 140, reward: 192.271500
episodes: 150, reward: 192.271500
episodes: 160, reward: 192.271500
episodes: 170, reward: 192.271500
episodes: 180, reward: 192.271500
episodes: 190, reward: 192.271500
episodes: 200, reward: 192.271500
episodes: 210, reward: 194.906340
episodes: 220, reward: 169.894597
episodes: 230, reward: 158.759930
episodes: 240, reward: 158.823888
episodes: 250, reward: 158.855482
episodes: 260, reward: 158.856328
episodes: 270, reward: 200.205032
episodes: 280, reward: 409.964218
episodes: 290, reward: 398.619348
episodes: 300, reward: 

In [1]:
import random
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import collections
from chargenv import Env
import time




class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        transitions = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*transitions)
        return state, action, reward, next_state, done

    def size(self):
        return len(self.buffer)


class PolicyNet(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim, action_bound):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
        self.action_bound = action_bound

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x)) * self.action_bound


class QValueNet(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(QValueNet, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)




class TD3:
    ''' TD3算法 '''

    def __init__(self, state_dim, hidden_dim, action_dim, action_bound, sigma, actor_lr, critic_lr, tau, gamma, device, policy_noise=0.2, noise_clip=0.1, policy_delay=2):
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device)
        self.critic1 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        self.critic2 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        self.actor.load_state_dict(torch.load("./model/train/actor_td3%d.pth" % arrival_rate))
        self.critic1.load_state_dict(torch.load("./model/train/critic1_td3%d.pth" % arrival_rate))
        self.critic2.load_state_dict(torch.load("./model/train/critic2_td3%d.pth" % arrival_rate))
        # 目标网络初始化并设置和策略相同的参数
        self.target_actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device)
        self.target_critic1 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        self.target_critic2 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        
        self.target_critic1.load_state_dict(self.critic1.state_dict())
        self.target_critic2.load_state_dict(self.critic2.state_dict())
        self.target_actor.load_state_dict(self.actor.state_dict())

        # 优化器
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr, weight_decay=0.01)
        self.critic_optimizer_1 = torch.optim.Adam(self.critic1.parameters(), lr=critic_lr, weight_decay=0.01)
        self.critic_optimizer_2 = torch.optim.Adam(self.critic2.parameters(), lr=critic_lr, weight_decay=0.01)

        self.gamma = gamma
        self.sigma = sigma  # 高斯噪声的标准差,均值直接设为0
        self.tau = tau  # 目标网络软更新参数
        self.action_dim = action_dim
        self.device = device

        # TD3的参数
        self.policy_noise = policy_noise  # 给目标动作添加噪声的标准差
        self.noise_clip = noise_clip  # 噪声剪裁
        self.policy_delay = policy_delay  # 策略网络的延迟更新步数
        self.action_bound = 0.5
    

    def decay_sigma(self):
        self.sigma *= 0.98

    def take_action(self, state):
        state = state.to(self.device)
        action = self.actor(state).cpu()
        # 给动作添加噪声，增加探索
        action = action.detach() + self.sigma * np.random.randn(self.action_dim)
        return action

   


def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, action_bound, device):
    return_list = []
    max_reward = 0
    for i_episode in range(num_episodes):
        episode_return = 0
        state = env.reset()
        done = False

        while not done:
            action = agent.take_action(state).to(device)
            next_state, reward, done = env.step(action)
            replay_buffer.add(state, action, reward, next_state, done)
            state = next_state
            episode_return += reward.item()
            



        return_list.append(episode_return)
        print(episode_return)

    return return_list


actor_lr = 0.001
critic_lr = 0.001
num_episodes = 100
hidden_dim = 128
gamma = 0.99
tau = 0.005
buffer_size = 10000
minimal_size = 1000
batch_size = 32
policy_noise = 0.2
noise_clip = 0.5
policy_delay = 2
sigma = 0.1
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

arrival_rate =3
data_path = './datasets/EVCD{}.csv'.format(arrival_rate)
env = Env(1, 0, 0, arrival_rate, data_path)

replay_buffer = ReplayBuffer(buffer_size)
state_dim = 4
action_dim = 1
action_bound = 0.5

agent = TD3(state_dim, hidden_dim, action_dim, action_bound, sigma, actor_lr, critic_lr, tau, gamma, device)
start_time = time.time()
return_list = train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, action_bound, device)
end_time = time.time()
print('MR:',sum(return_list) / len(return_list),'Time:',(end_time-start_time)/num_episodes)



938.9253877438605
835.0247916020453
835.0247916020453
856.3208311833441
808.0486944951117
808.0486944951117
835.0247916020453
856.3208311833441
856.3208311833441
835.0247916020453
835.0247916020453
856.3208311833441
835.0247916020453
835.0247916020453
856.3208311833441
808.0486944951117
835.0247916020453
835.0247916020453
835.0247916020453
856.3208311833441
835.0247916020453
835.0247916020453
835.0247916020453
835.0247916020453
808.0486944951117
856.3208311833441
856.3208311833441
856.3208311833441
835.0247916020453
856.3208311833441
835.0247916020453
835.0247916020453
856.3208311833441
808.0486944951117
808.0486944951117
856.3208311833441
808.0486944951117
856.3208311833441
856.3208311833441
787.3074053563178
787.3074053563178
835.0247916020453
835.0247916020453
835.0247916020453
835.0247916020453
856.3208311833441
787.3074053563178
856.3208311833441
856.3208311833441
835.0247916020453
835.0247916020453
835.0247916020453
856.3208311833441
835.0247916020453
835.0247916020453
835.024791