In [7]:
import random
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import collections
from chargenv import Env

torch.autograd.set_detect_anomaly(True)


class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A bounded deque silently drops the oldest transition once full.
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns five tuples (states, actions, rewards, next_states, dones),
        each of length ``batch_size``. ``random.sample`` raises ``ValueError``
        when ``batch_size`` exceeds the number of stored transitions.
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return states, actions, rewards, next_states, dones

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


class PolicyNet(torch.nn.Module):
    """Deterministic actor: one hidden ReLU layer followed by a scaled tanh.

    The output of every action component lies in
    ``[-action_bound, action_bound]``.
    """

    def __init__(self, state_dim, hidden_dim, action_dim, action_bound):
        super().__init__()
        # Layer names are part of the state_dict keys used by saved checkpoints.
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)
        self.action_bound = action_bound

    def forward(self, state):
        """Map a batch of states to bounded actions."""
        hidden = torch.relu(self.fc1(state))
        squashed = torch.tanh(self.fc2(hidden))
        return squashed * self.action_bound


class QValueNet(torch.nn.Module):
    """Critic: scores a (state, action) pair with a single scalar output."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        # Layer names are part of the state_dict keys used by saved checkpoints.
        self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.fc_out = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x, a):
        """Return the value estimate for states ``x`` paired with actions ``a``."""
        # Concatenate state and action along the feature dimension.
        joint = torch.cat((x, a), dim=1)
        hidden = torch.relu(self.fc1(joint))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc_out(hidden)


class DDPG:
    """DDPG agent: actor/critic pair with target networks and Gaussian exploration noise."""

    def __init__(self, state_dim, hidden_dim, action_dim, action_bound, sigma, actor_lr, critic_lr, tau, gamma, device):
        """Build the networks, warm-start them from checkpoints and create the optimisers.

        The actor and critic are initialised from the hard-coded
        ``./model/ddpg_*_initial5.pth`` files; ``torch.load`` raises if they
        are missing.
        """
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device)
        self.critic = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        # map_location keeps checkpoints saved on another device loadable here.
        self.actor.load_state_dict(torch.load("./model/ddpg_actor_initial5.pth", map_location=device))
        self.critic.load_state_dict(torch.load("./model/ddpg_critic_initial5.pth", map_location=device))
        self.target_actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device)
        self.target_critic = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        # Target networks start as exact copies of the online networks.
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr, weight_decay=0.01)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=0.01)
        self.gamma = gamma
        self.sigma = sigma  # std of the zero-mean Gaussian exploration noise
        self.tau = tau  # soft-update coefficient for the target networks
        self.action_dim = action_dim
        self.device = device

    def normalize(self, rewards):
        """Min-max scale a tensor into (approximately) ``[0, 1]``.

        :param rewards: tensor of values to rescale
        :return: a new leaf tensor on ``self.device`` with ``requires_grad=True``
        """
        values = rewards.detach()
        low = values.min()
        high = values.max()
        # The small constant avoids a division by zero when all values are equal.
        scaled = (values - low) / (high - low + 1e-10)
        return scaled.clone().requires_grad_(True).to(self.device)

    def decay_sigma(self):
        """Shrink the exploration noise by 2 %."""
        self.sigma *= 0.98

    def take_action(self, state):
        """Return the actor's action for ``state`` plus Gaussian noise, as a CPU tensor."""
        state = state.to(self.device)
        # No graph is needed for acting; the result was detached anyway.
        with torch.no_grad():
            action = self.actor(state).cpu()
        # Add noise to the action to encourage exploration.
        return action + self.sigma * torch.randn(self.action_dim)

    def soft_update(self, net, target_net):
        """Move ``target_net`` towards ``net``: target = (1 - tau) * target + tau * net."""
        with torch.no_grad():
            for param_target, param in zip(target_net.parameters(), net.parameters()):
                param_target.copy_(param_target * (1.0 - self.tau) + param * self.tau)

    def update(self, transition_dict):
        """Run one critic step, one actor step and a soft target update.

        :param transition_dict: dict with keys ``'states'``, ``'actions'``,
            ``'rewards'``, ``'next_states'`` and ``'dones'``. The state/action
            entries are sequences of tensors concatenated along dim 0; rewards
            and dones are converted to float column vectors.
        :return: ``(critic_loss, actor_loss)`` as detached scalar tensors
        """
        states = torch.cat(transition_dict['states'], dim=0).to(torch.float).to(self.device)
        actions = torch.cat(transition_dict['actions'], dim=0).to(torch.float).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.cat(transition_dict['next_states'], dim=0).to(torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1, 1).to(self.device)

        # The TD target is a constant for the critic loss: computing it without
        # autograd avoids backpropagating into (and accumulating stale
        # gradients on) the target networks.
        with torch.no_grad():
            next_q_values = self.target_critic(next_states, self.target_actor(next_states))
            q_targets = rewards + self.gamma * next_q_values * (1 - dones)

        # mse_loss already averages over the batch.
        critic_loss = F.mse_loss(self.critic(states, actions), q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # The actor ascends the critic's estimate of its own actions.
        actor_loss = -torch.mean(self.critic(states, self.actor(states)))
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.actor, self.target_actor)  # soft-update the policy network
        self.soft_update(self.critic, self.target_critic)  # soft-update the value network
        # Detach so callers holding the losses do not keep the graphs alive.
        return critic_loss.detach(), actor_loss.detach()


def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, device):
    """Train ``agent`` on ``env`` with experience replay and return per-episode returns.

    Each episode decays the agent's exploration noise once, then steps the
    environment until it reports ``done``. Every transition is stored in
    ``replay_buffer``; once the buffer holds more than ``minimal_size``
    transitions, one ``agent.update`` is run per environment step on a batch
    of ``batch_size`` samples. The mean return of the last (up to) ten
    episodes is printed every ten episodes.

    :param env: object exposing ``reset()`` and ``step(action)`` returning
        ``(next_state, reward, done)``; ``reward`` must support ``.item()``
    :param agent: object exposing ``decay_sigma()``, ``take_action(state)``
        and ``update(transition_dict)``
    :param device: device the chosen action is moved to before stepping
    :return: list with the summed reward of every episode
    """
    return_list = []
    for i_episode in range(num_episodes):
        episode_return = 0
        state = env.reset()
        agent.decay_sigma()
        done = False

        while not done:
            action = agent.take_action(state).to(device)
            # The environment receives the action with its two dims swapped.
            next_state, reward, done = env.step(torch.transpose(action, 0, 1))
            replay_buffer.add(state, action, reward, next_state, done)
            state = next_state
            episode_return += reward.item()

            # Start learning only after the buffer has warmed up.
            if replay_buffer.size() > minimal_size:
                b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                transition_dict = {'states': b_s, 'actions': b_a, 'next_states': b_ns, 'rewards': b_r, 'dones': b_d}
                agent.update(transition_dict)

        return_list.append(episode_return)
        if (i_episode + 1) % 10 == 0:
            recent = return_list[-10:]
            print('episodes: %d, reward: %f' % (i_episode + 1, sum(recent) / len(recent)))
    return return_list


# --- Hyper-parameters for the training run ---
actor_lr = 0.005
critic_lr = 0.005
num_episodes = 1000

hidden_dim = 128
gamma = 0.98
tau = 0.001  # soft-update coefficient for the target networks
buffer_size = 40000

minimal_size = 5000
batch_size = 128
sigma = 0.5  # standard deviation of the Gaussian exploration noise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# --- Environment, replay buffer and agent ---
arrival_rate = 5
data_path = './datasets/EVCD{}.csv'.format(arrival_rate)
env = Env(1, 0, 0, arrival_rate, data_path)

replay_buffer = ReplayBuffer(buffer_size)
state_dim = 4

action_dim = env.n_cs
action_bound = 0.5  # maximum magnitude of an action component
agent = DDPG(state_dim, hidden_dim, action_dim, action_bound, sigma, actor_lr, critic_lr, tau, gamma, device)

# --- Training ---
return_list = train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, device)

# episodes_list = list(range(len(return_list)))
# plt.plot(episodes_list, return_list)
# plt.xlabel('Episodes')
# plt.ylabel('Returns')
# plt.title('DDPG on DY')
# plt.show()

# Persist one episode return per line.
with open('./result/ddpg{}.csv'.format(arrival_rate), 'w', encoding='utf-8') as file:
    file.writelines(f"{item}\n" for item in return_list)



episodes: 10, reward: 177.081497
episodes: 20, reward: 109.968953
episodes: 30, reward: 208.811878
episodes: 40, reward: 188.276413
episodes: 50, reward: 152.703500
episodes: 60, reward: 135.768466
episodes: 70, reward: 148.563847
episodes: 80, reward: 147.357442
episodes: 90, reward: 122.924521
episodes: 100, reward: 123.486603
episodes: 110, reward: 139.247587
episodes: 120, reward: 137.008859
episodes: 130, reward: 140.488387
episodes: 140, reward: 132.708817
episodes: 150, reward: 131.878316
episodes: 160, reward: 126.551942
episodes: 170, reward: 131.396768
episodes: 180, reward: 130.478831
episodes: 190, reward: 135.964710
episodes: 200, reward: 138.747390
episodes: 210, reward: 128.290901
episodes: 220, reward: 599.137426
episodes: 230, reward: 956.639980
episodes: 240, reward: 979.443796
episodes: 250, reward: 1007.066318
episodes: 260, reward: 971.005488
episodes: 270, reward: 1017.417157
episodes: 280, reward: 1048.362604
episodes: 290, reward: 1052.698631
episodes: 300, rewa

KeyboardInterrupt: 

In [1]:
import random
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import collections
from chargenv import Env
import time
import os

torch.autograd.set_detect_anomaly(True)


class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A bounded deque silently drops the oldest transition once full.
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns five tuples (states, actions, rewards, next_states, dones),
        each of length ``batch_size``. ``random.sample`` raises ``ValueError``
        when ``batch_size`` exceeds the number of stored transitions.
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return states, actions, rewards, next_states, dones

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


class PolicyNet(torch.nn.Module):
    """Deterministic actor: one hidden ReLU layer followed by a scaled tanh.

    The output of every action component lies in
    ``[-action_bound, action_bound]``.
    """

    def __init__(self, state_dim, hidden_dim, action_dim, action_bound):
        super().__init__()
        # Layer names are part of the state_dict keys used by saved checkpoints.
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)
        self.action_bound = action_bound

    def forward(self, state):
        """Map a batch of states to bounded actions."""
        hidden = torch.relu(self.fc1(state))
        squashed = torch.tanh(self.fc2(hidden))
        return squashed * self.action_bound


class QValueNet(torch.nn.Module):
    """Critic: scores a (state, action) pair with a single scalar output."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        # Layer names are part of the state_dict keys used by saved checkpoints.
        self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.fc_out = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x, a):
        """Return the value estimate for states ``x`` paired with actions ``a``."""
        # Concatenate state and action along the feature dimension.
        joint = torch.cat((x, a), dim=1)
        hidden = torch.relu(self.fc1(joint))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc_out(hidden)


class DDPG:
    """DDPG agent: actor/critic pair with target networks and Gaussian exploration noise."""

    def __init__(self, state_dim, hidden_dim, action_dim, action_bound, sigma, actor_lr, critic_lr, tau, gamma, device):
        """Build the networks, restore trained checkpoints and create the optimisers.

        Checkpoint paths are derived from the module-level ``arrival_rate``,
        which must be defined before an instance is created; ``torch.load``
        raises if a file is missing.
        """
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device)
        self.critic = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        self.target_actor = PolicyNet(state_dim, hidden_dim, action_dim, action_bound).to(device)
        self.target_critic = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        # map_location keeps checkpoints saved on another device loadable here.
        self.actor.load_state_dict(
            torch.load("./model/train/actor_ddpg%d.pth" % arrival_rate, map_location=device))
        self.critic.load_state_dict(
            torch.load("./model/train/critic_ddpg%d.pth" % arrival_rate, map_location=device))
        self.target_actor.load_state_dict(
            torch.load("./model/train/target_actor_ddpg%d.pth" % arrival_rate, map_location=device))
        self.target_critic.load_state_dict(
            torch.load("./model/train/target_critic_ddpg%d.pth" % arrival_rate, map_location=device))
        # NOTE(review): the two copies below overwrite the target checkpoints
        # loaded just above, so the targets end up equal to the online
        # networks. Behaviour is kept as-is; confirm which one is intended.
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr, weight_decay=0.01)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=0.01)
        self.gamma = gamma
        self.sigma = sigma  # std of the zero-mean Gaussian exploration noise
        self.tau = tau  # soft-update coefficient for the target networks
        self.action_dim = action_dim
        self.device = device

    def normalize(self, rewards):
        """Min-max scale a tensor into (approximately) ``[0, 1]``.

        :param rewards: tensor of values to rescale
        :return: a new leaf tensor on ``self.device`` with ``requires_grad=True``
        """
        values = rewards.detach()
        low = values.min()
        high = values.max()
        # The small constant avoids a division by zero when all values are equal.
        scaled = (values - low) / (high - low + 1e-10)
        return scaled.clone().requires_grad_(True).to(self.device)

    def decay_sigma(self):
        """Shrink the exploration noise by 2 %."""
        self.sigma *= 0.98

    def take_action(self, state):
        """Return the actor's action for ``state`` plus Gaussian noise, as a CPU tensor."""
        state = state.to(self.device)
        # No graph is needed for acting; the result was detached anyway.
        with torch.no_grad():
            action = self.actor(state).cpu()
        # Noise is drawn with torch so no NumPy array is mixed into the tensor.
        return action + self.sigma * torch.randn(self.action_dim)

    def soft_update(self, net, target_net):
        """Move ``target_net`` towards ``net``: target = (1 - tau) * target + tau * net."""
        with torch.no_grad():
            for param_target, param in zip(target_net.parameters(), net.parameters()):
                param_target.copy_(param_target * (1.0 - self.tau) + param * self.tau)

    def update(self, transition_dict):
        """Run one critic step, one actor step and a soft target update.

        :param transition_dict: dict with keys ``'states'``, ``'actions'``,
            ``'rewards'``, ``'next_states'`` and ``'dones'``. The state/action
            entries are sequences of tensors concatenated along dim 0; rewards
            and dones are converted to float column vectors.
        :return: ``(critic_loss, actor_loss)`` as detached scalar tensors
        """
        states = torch.cat(transition_dict['states'], dim=0).to(torch.float).to(self.device)
        actions = torch.cat(transition_dict['actions'], dim=0).to(torch.float).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.cat(transition_dict['next_states'], dim=0).to(torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1, 1).to(self.device)

        # The TD target is a constant for the critic loss: computing it without
        # autograd avoids backpropagating into (and accumulating stale
        # gradients on) the target networks.
        with torch.no_grad():
            next_q_values = self.target_critic(next_states, self.target_actor(next_states))
            q_targets = rewards + self.gamma * next_q_values * (1 - dones)

        # mse_loss already averages over the batch.
        critic_loss = F.mse_loss(self.critic(states, actions), q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # The actor ascends the critic's estimate of its own actions.
        actor_loss = -torch.mean(self.critic(states, self.actor(states)))
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.actor, self.target_actor)  # soft-update the policy network
        self.soft_update(self.critic, self.target_critic)  # soft-update the value network
        # Detach so callers holding the losses do not keep the graphs alive.
        return critic_loss.detach(), actor_loss.detach()


def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, device):
    """Roll out ``agent`` on ``env`` without learning and return per-episode returns.

    Despite its name, this variant only evaluates: no transition is stored and
    ``agent.update`` is never called. ``replay_buffer``, ``minimal_size`` and
    ``batch_size`` are accepted only to keep the same signature as the
    training loop and are not used. ``agent.decay_sigma()`` is still called
    once per episode, and every episode return is printed.

    :param env: object exposing ``reset()`` and ``step(action)`` returning
        ``(next_state, reward, done)``; ``reward`` must support ``.item()``
    :param agent: object exposing ``decay_sigma()`` and ``take_action(state)``
    :param device: device the chosen action is moved to before stepping
    :return: list with the summed reward of every episode
    """
    return_list = []
    for _ in range(num_episodes):
        episode_return = 0
        state = env.reset()
        agent.decay_sigma()
        done = False

        while not done:
            action = agent.take_action(state).to(device)
            # The environment receives the action with its two dims swapped.
            state, reward, done = env.step(torch.transpose(action, 0, 1))
            episode_return += reward.item()

        return_list.append(episode_return)
        print(episode_return)
    return return_list


# --- Hyper-parameters (only those needed to rebuild the agent matter here) ---
actor_lr = 0.001
critic_lr = 0.001
num_episodes = 100

hidden_dim = 128
gamma = 0.98
tau = 0.001  # soft-update coefficient for the target networks
buffer_size = 10000

minimal_size = 200
batch_size = 128
sigma = 0  # standard deviation of the Gaussian noise; zero disables it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# --- Environment, replay buffer and agent ---
arrival_rate = 1
data_path = './datasets/EVCD{}.csv'.format(arrival_rate)
env = Env(1, 0, 0, arrival_rate, data_path)


replay_buffer = ReplayBuffer(buffer_size)
state_dim = 4

action_dim = env.n_cs
action_bound = 0.5  # maximum magnitude of an action component
agent = DDPG(state_dim, hidden_dim, action_dim, action_bound, sigma, actor_lr, critic_lr, tau, gamma, device)

# --- Timed rollout ---
start_time = time.time()
return_list = train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, device)
end_time = time.time()

# Report the mean episode return and the wall-clock seconds per episode.
mean_return = sum(return_list) / len(return_list)
seconds_per_episode = (end_time - start_time) / num_episodes
print('MR:', mean_return, 'Time:', seconds_per_episode)


# with open('./result/test/DDPG{}.csv'.format(arrival_rate), 'w', encoding='utf-8') as file:
#     for item in return_list:
#         file.write(f"{item}\n")
        
# os.makedirs(os.path.dirname('./result/test/DDPG{}.txt'.format(arrival_rate)), exist_ok=True)

# 写入文件
# with open('./result/test/DDPG{}.txt'.format(arrival_rate), 'w', encoding='utf-8') as file:
#     file.write(f'Test_mode: DDPG{arrival_rate}, MR: {sum(return_list) / len(return_list)}, Time: {(end_time-start_time)/num_episodes}')



1003.2810235023499
848.2722661495209
882.0003035068512
856.2578642368317
856.2578642368317
882.0003035068512
882.0003035068512
882.0003035068512
882.0003035068512
882.0003035068512
882.0003035068512
882.0003035068512
882.0003035068512
882.0003035068512
856.2578642368317
856.2578642368317
882.0003035068512
856.2578642368317
882.0003035068512
874.0147054195404
882.0003035068512
882.0003035068512
882.0003035068512
856.2578642368317
882.0003035068512
856.2578642368317
856.2578642368317
882.0003035068512
882.0003035068512
874.0147054195404
882.0003035068512
874.0147054195404
870.6365988254547
882.0003035068512
882.0003035068512
896.3790380954742
882.0003035068512


KeyboardInterrupt: 